diff --git a/.gitignore b/.gitignore
index d6cff24c..52940f9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .*.swp
 _build/
+doc/
 *.bak
 setup.data
 setup.log
diff --git a/.ocamlformat b/.ocamlformat
index 27ab73c3..7d450c63 100644
--- a/.ocamlformat
+++ b/.ocamlformat
@@ -1,3 +1,7 @@
-version=0.26.2
-profile=janestreet
-ocaml-version=4.08.0
+profile = default
+break-cases = fit
+margin = 80
+parse-docstrings = true
+wrap-comments = false
+doc-comments = before
+max-indent=2
diff --git a/benchmarks/unicode/benchmark_unicode.ml b/benchmarks/unicode/benchmark_unicode.ml
new file mode 100644
index 00000000..03db9bed
--- /dev/null
+++ b/benchmarks/unicode/benchmark_unicode.ml
@@ -0,0 +1,214 @@
+open Core
+open Core_bench
+open Re_unicode.Utf8
+
+let str_20_zeroes = String.make 20 '0'
+let re_20_zeroes = Re.(str str_20_zeroes)
+let lots_of_a's = String.make 100 'a' ^ "b"
+
+let lots_o_a's_re =
+  Re.(seq [ letter (char 'a'); opt (letter (char 'a')); letter (char 'b') ])
+
+let media_type_re =
+  let re = Re.Emacs.re ~case:true "[ \t]*\\([^ \t;]+\\)" in
+  Re.(seq [ start; re ])
+
+(* URI reference regex, taken from https://github.com/rgrinberg/ocaml-uri/blob/903ef1010f9808d6f3f6d9c1fe4b4eabbd76082d/lib/uri.ml *)
+let uri_reference =
+  Re.Posix.re "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
+
+let uris =
+  [
+    "https://google.com";
+    "http://yahoo.com/xxx/yyy?query=param&one=two";
+    "file:/random_crap";
+  ]
+
+let benchmarks =
+  [
+    ("20 zeroes", re_20_zeroes, [ str_20_zeroes ]);
+    ("lots of a's", lots_o_a's_re, [ lots_of_a's ]);
+    ("media type match", media_type_re, [ " foo/bar ; charset=UTF-8" ]);
+    ("uri", uri_reference, uris);
+  ]
+
+(* [test ~name re f] benchmarks [f] twice: with [re] called on every run, and
+   with a single lazily forced [re ()] shared by every "(compiled)" run. *)
+let test ~name re f =
+  let compiled = lazy (re ()) in
+  [
+    Bench.Test.create ~name (fun () -> f re);
+    Bench.Test.create ~name:(sprintf "%s (compiled)" name) (fun () ->
+        f (fun () -> Lazy.force compiled));
+  ]
+
+let exec_bench exec name (re : Re.t) cases =
+  Bench.Test.create_group ~name
+    (List.concat_map cases ~f:(fun data ->
+         let name =
+           let len = String.length data in
+           if len > 70 then
+             Printf.sprintf "%s .. (%d)" (String.sub data ~pos:0 ~len:10) len
+           else data
+         in
+         let re () = Re.compile re in
+         test ~name re (fun re -> ignore (exec (re ()) data))))
+
+let exec_bench_many exec name re cases =
+  test ~name
+    (fun () -> Re.compile re)
+    (fun re ->
+      let re = re () in
+      List.iter cases ~f:(fun x -> ignore (exec re x)))
+
+let string_traversal =
+  let len = 1000 * 1000 in
+  let s = String.make len 'a' in
+  let re =
+    let re = Re.Pcre.re "aaaaaaaaaaaaaaaaz" in
+    fun () -> Re.compile re
+  in
+  test ~name:"string traversal from #210" re (fun re ->
+      ignore (Re.execp (re ()) s ~pos:0))
+
+let compile_clean_star =
+  let c = 'c' in
+  let s = String.make 10_000 c in
+  let re = Re.rep (Re.char 'c' |> Re.letter) in
+  let re () = Re.compile re in
+  test ~name:"kleene star compilation" re (fun re ->
+      ignore (Re.execp (re ()) s))
+
+let gen_chars n =
+  let bytes = Bytes.create (n * 4) in
+  let pos = ref 0 in
+  for i = 0 to n - 1 do
+    pos :=
+      !pos
+      + Cset.Codec.set bytes !pos (Cset.CodePage.of_char @@ Stdlib.Char.chr i)
+  done;
+  Stdlib.Bytes.sub_string bytes 0 !pos
+
+let repeated_sequence =
+  let s = gen_chars 256 in
+  let re () = Re.repn (Re.str s) 50 (Some 50) |> Re.compile in
+  let s = List.init 50 ~f:(fun _ -> s) |> String.concat ~sep:"" in
+  test ~name:"repeated sequence re" re (fun re ->
+      let re = re () in
+      ignore (Re.execp re s))
+
+let split =
+  let s = Bytes.make 1_000 '_' in
+  for i = 0 to 100 do
+    Bytes.set s (i * 9) ' '
+  done;
+  let s = Bytes.to_string s in
+  let re () = Re.(rep1 space |> compile) in
+  test ~name:"split on whitespace" re (fun re ->
+      ignore (Re.split_full (re ()) s))
+
+let prefixes =
+  let make_ext =
+    let chars = "abcdefghiklmnopqrstuvwxyz" in
+    let buf = Buffer.create 4 in
+    let rec loop remains =
+      match remains with
+      | 0 -> Buffer.contents buf
+      | _ ->
+          let char = remains mod String.length chars in
+          Buffer.add_char buf chars.[char];
+          loop (remains / String.length chars)
+    in
+    fun n ->
+      Buffer.clear buf;
+      loop n
+  in
+  let n_extensions = 100 in
+  let n_base = 20 in
+  let base = String.make n_base 'x' ^ "." in
+  let extensions = List.init n_extensions ~f:make_ext in
+  let re () =
+    (* This regular expression can be heavily optimized by computing the shared prefix *)
+    List.init n_extensions ~f:(fun i ->
+        let ext = make_ext i in
+        let open Re in
+        seq [ rep1 any; char '.' |> letter; str ext ])
+    |> Re.alt |> Re.compile
+  in
+  let extensions = Array.of_list extensions in
+  test ~name:"shared prefixes" re (fun re ->
+      let re = re () in
+      for i = 0 to Array.length extensions - 1 do
+        let extension = extensions.(i) in
+        let str = base ^ extension in
+        ignore (Re.execp re str)
+      done)
+
+let benchmarks =
+  let benches =
+    List.map benchmarks ~f:(fun (name, re, cases) ->
+        Bench.Test.create_group ~name
+          [
+            exec_bench Re.exec "exec" re cases;
+            exec_bench Re.execp "execp" re cases;
+            exec_bench Re.exec_opt "exec_opt" re cases;
+          ])
+  in
+  let http_benches =
+    let open Http.Export in
+    let manual =
+      [ (request, "no group"); (request_g, "group") ]
+      |> List.concat_map ~f:(fun (re, name) ->
+             let re () = Re.compile re in
+             test ~name re (fun re ->
+                 let re = re () in
+                 Http.read_all 0 re Http.requests))
+      |> Bench.Test.create_group ~name:"manual"
+    in
+    let many =
+      [
+        test ~name:"execp no group"
+          (fun () -> Re.compile requests)
+          (fun re -> ignore (Re.execp (re ()) Http.requests));
+        test ~name:"all_gen"
+          (fun () -> Re.compile requests_g)
+          (fun re -> Http.requests |> Re.all (re ()));
+      ]
+      |> List.concat
+      |> Bench.Test.create_group ~name:"auto"
+    in
+    Bench.Test.create_group ~name:"http" [ manual; many ]
+  in
+  benches
+  @ [
+      [
+        exec_bench_many Re.execp "execp"; exec_bench_many Re.exec_opt "exec_opt";
+      ]
+      |> List.concat_map ~f:(fun f -> f Tex.ignore_re Tex.ignore_filesnames)
+      |> Bench.Test.create_group ~name:"tex gitignore";
+    ]
+  @ [ http_benches ] @ string_traversal @ compile_clean_star @ Memory.benchmarks
+  @ repeated_sequence @ split @ prefixes
+
+let () =
+  let benchmarks =
+    match Sys.getenv "RE_BENCH_FILTER" with
+    | None -> benchmarks
+    | Some only -> (
+        let only = String.split ~on:',' only in
+        let filtered =
+          List.filter benchmarks ~f:(fun bench ->
+              let name = Bench.Test.name bench in
+              List.mem only name ~equal:String.equal)
+        in
+        match filtered with
+        | _ :: _ -> filtered
+        | [] ->
+            print_endline "No benchmarks to run. Your options are:";
+            List.iter benchmarks ~f:(fun bench ->
+                let name = Bench.Test.name bench in
+                Printf.printf "- %s\n" name);
+            exit 1)
+  in
+  Memtrace.trace_if_requested ();
+  Command_unix.run (Bench.make_command benchmarks)
diff --git a/benchmarks/unicode/compare_unicode.ml b/benchmarks/unicode/compare_unicode.ml
new file mode 100644
index 00000000..2551d09b
--- /dev/null
+++ b/benchmarks/unicode/compare_unicode.ml
@@ -0,0 +1,188 @@
+open Core
+
+(** A pair of values: [lhs] from the previous run, [rhs] from the next one. *)
+module Both = struct
+  type 'a t =
+    { lhs : 'a
+    ; rhs : 'a
+    }
+end
+
+module Value = struct
+  type t =
+    | Int of int
+    | Float of float
+
+  (** Reads an [Int] when [s] parses as one, otherwise a [Float]. *)
+  let of_string s =
+    try Int (Int.of_string s) with
+    | _ -> Float (Float.of_string s)
+  ;;
+
+  (** [percent_delta x y] is the change from [x] to [y], in percent of [x]. *)
+  let rec percent_delta x y =
+    match x, y with
+    | Int x, Int y ->
+      let delta = y - x in
+      let open Float in
+      Float (100. * Float.of_int delta / Float.of_int x)
+    | Float x, Float y -> Float Float.(100. * (y - x) / x)
+    | Float x, Int y -> percent_delta (Float x) (Float (Float.of_int y))
+    | Int x, Float y -> percent_delta (Float (Float.of_int x)) (Float y)
+  ;;
+
+  let to_csv t =
+    match t with
+    | Float f -> Float.to_string_hum f
+    | Int x -> Int.to_string_hum x
+  ;;
+
+  let compare x y =
+    match x, y with
+    | Float x, Float y -> Float.compare x y
+    | Int x, Int y -> Int.compare x y
+    | Int x, Float y -> Float.compare (Float.of_int x) y
+    | Float x, Int y -> Float.compare x (Float.of_int y)
+  ;;
+end
+
+type 'a bench =
+  { name : string
+  ; time_per_run_nanos : 'a
+  ; major_words_per_run : 'a
+  ; promoted_words_per_run : 'a
+  ; minor_words_per_run : 'a
+  }
+
+let of_sexp (sexp : Sexp.t) =
+  match sexp with
+  | Atom _ -> failwith "expected list"
+  | List fields ->
+    let kv (sexp : Sexp.t) =
+      match sexp with
+      | List [ Atom k; Atom v ] -> Some (k, v)
+      | _ -> None
+    in
+    let fields = List.filter_map fields ~f:kv in
+    let field name =
+      List.find_map_exn fields ~f:(fun (k, v) ->
+        if String.equal k name then Some v else None)
+    in
+    let name = field "full_benchmark_name" in
+    let time_per_run_nanos = Value.of_string (field "time_per_run_nanos") in
+    let major_words_per_run = Value.of_string (field "major_words_per_run") in
+    let promoted_words_per_run = Value.of_string (field "promoted_words_per_run") in
+    let minor_words_per_run = Value.of_string (field "minor_words_per_run") in
+    { name
+    ; time_per_run_nanos
+    ; major_words_per_run
+    ; promoted_words_per_run
+    ; minor_words_per_run
+    }
+;;
+
+let parse_all s =
+  match Sexp.of_string s with
+  | Atom _ -> failwith "list expected"
+  | List benches ->
+    List.map benches ~f:of_sexp
+    |> String.Map.of_list_with_key_exn ~get_key:(fun v -> v.name)
+;;
+
+let merge_one
+  { name
+  ; time_per_run_nanos
+  ; major_words_per_run
+  ; promoted_words_per_run
+  ; minor_words_per_run
+  }
+  b
+  =
+  assert (String.equal name b.name);
+  { b with
+    time_per_run_nanos = { Both.lhs = time_per_run_nanos; rhs = b.time_per_run_nanos }
+  ; major_words_per_run = { Both.lhs = major_words_per_run; rhs = b.major_words_per_run }
+  ; promoted_words_per_run =
+      { Both.lhs = promoted_words_per_run; rhs = b.promoted_words_per_run }
+  ; minor_words_per_run = { Both.lhs = minor_words_per_run; rhs = b.minor_words_per_run }
+  }
+;;
+
+(** Pairs up benchmarks by name, dropping those missing from either run. *)
+let merge lhs rhs =
+  Map.merge lhs rhs ~f:(fun ~key:_ v ->
+    match v with
+    | `Left _ -> None
+    | `Right _ -> None
+    | `Both (lhs, rhs) -> Some (merge_one lhs rhs))
+;;
+
+(** Writes the comparison as CSV to stdout, rows sorted by time delta. *)
+let run ~prev ~next =
+  let report =
+    let prev = Stdio.In_channel.read_all prev |> parse_all in
+    let next = Stdio.In_channel.read_all next |> parse_all in
+    merge prev next
+  in
+  let records =
+    let headers =
+      [ "name"
+      ; "time_per_run_nanos"
+      ; "delta (%)"
+      ; "major_words_per_run"
+      ; "delta (%)"
+      ; "promoted_words_per_run"
+      ; "delta (%)"
+      ; "minor_words_per_run"
+      ; "delta (%)"
+      ]
+    in
+    let values =
+      Map.to_alist report
+      |> List.map ~f:snd
+      |> List.map
+           ~f:
+             (fun
+                 ({ name
+                  ; time_per_run_nanos
+                  ; major_words_per_run
+                  ; promoted_words_per_run
+                  ; minor_words_per_run
+                  } :
+                   Value.t Both.t bench)
+               ->
+             let time_delta =
+               Value.percent_delta time_per_run_nanos.lhs time_per_run_nanos.rhs
+             in
+             let make_delta { Both.lhs; rhs } =
+               let delta = Value.percent_delta lhs rhs in
+               [ Value.to_csv lhs; Value.to_csv delta ]
+             in
+             ( time_delta
+             , name
+               :: List.concat
+                    [ make_delta time_per_run_nanos
+                    ; make_delta major_words_per_run
+                    ; make_delta promoted_words_per_run
+                    ; make_delta minor_words_per_run
+                    ] ))
+      |> List.sort ~compare:(fun (x, _) (y, _) -> Value.compare x y)
+      |> List.map ~f:snd
+    in
+    headers :: values
+  in
+  let chan = Csv.to_channel Stdio.stdout in
+  Csv.output_all chan records
+;;
+
+let command =
+  let open Command.Param in
+  let open Command.Param.Applicative_infix in
+  Command.basic
+    ~summary:"compare two runs"
+    (let prev = flag "prev" (required string) ~doc:"sexp file" in
+     let next = flag "next" (required string) ~doc:"sexp file" in
+     Command.Param.return (fun prev next () -> run ~prev ~next) <*> prev <*> next)
+;;
+
+let () = Command_unix.run command
diff --git a/benchmarks/unicode/dune b/benchmarks/unicode/dune
new file mode 100644
index 00000000..1925ada0
--- /dev/null
+++ b/benchmarks/unicode/dune
@@ -0,0 +1,35 @@
+(env
+ (dev
+  (flags
+   (:standard -w -58))))
+
+(executables
+ (enabled_if
+  (not %{env:CI=false}))
+ (libraries
+  re.unicode
+  core
+  base
+  stdio
+  threads
+  core_bench
+  core_unix.command_unix
+  memtrace)
+ (modules :standard \ compare_unicode)
+ (names benchmark_unicode))
+
+(executable
+ (enabled_if
+  (not %{env:CI=false}))
+ (name compare_unicode)
+ (modules compare_unicode)
+ (libraries
+  core
+  csv
+  base
+  core_unix
+  core_unix.command_unix
+  core_unix.filename_unix
+  spawn
+  stdio
+  sexplib))
diff --git a/benchmarks/unicode/files b/benchmarks/unicode/files
new file mode 100644
index 00000000..886df22a
--- /dev/null
+++ b/benchmarks/unicode/files
@@ -0,0 +1,1120 @@
+./.merlin
+./re_match.native
+./lib
+./lib/re-api.odocl
+./lib/re_glob.mllib
+./lib/re.mli
+./lib/re_cset.ml
+./lib/re_automata.ml
+./lib/re.ml
+./lib/re_emacs.mli
+./lib/re_glob.mldylib
+./lib/re_pcre.mli
+./lib/re_perl.mldylib +./lib/re_posix.mli +./lib/re_perl.mli +./lib/re.mldylib +./lib/re_posix.mldylib +./lib/re_automata.mli +./lib/re_glob.ml +./lib/re_emacs.ml +./lib/re_pcre.mllib +./lib/re_glob.mli +./lib/re_str.mldylib +./lib/re.mllib +./lib/re_posix.mllib +./lib/re_pcre.ml +./lib/re_emacs.mldylib +./lib/META +./lib/re_pcre.mldylib +./lib/re_cset.mli +./lib/re_emacs.mllib +./lib/re_str.mli +./lib/re_perl.ml +./lib/re_fmt.ml +./lib/re_perl.mllib +./lib/re_posix.ml +./lib/re_str.ml +./lib/re_str.mllib +./lib_test +./lib_test/test_easy.ml +./lib_test/test_emacs.ml +./lib_test/re_match.ml +./lib_test/longest.c +./lib_test/pcre_match.ml +./lib_test/.cvsignore +./lib_test/test_glob.ml +./lib_test/fort_unit.mllib +./lib_test/test_perl.ml +./lib_test/fort_unit.ml +./lib_test/test_exec_iter.ml +./lib_test/perl_scan.pl +./lib_test/Input +./lib_test/scan.ml +./lib_test/fort_unit.mldylib +./lib_test/unison2.ml +./lib_test/META +./lib_test/unison.ml +./lib_test/test_pcre.ml +./lib_test/glob.ml +./lib_test/pcre_scan.ml +./lib_test/re_scan.ml +./lib_test/Makefile +./lib_test/test_str.ml +./lib_test/test_re.ml +./lib_test/unison3.ml +./setup.exe +./_oasis +./repl.ml +./test_perl.native +./setup.data +./test_re.native +./setup.ml +./benchmarks +./benchmarks/.merlin +./benchmarks/benchmark.ml +./benchmarks/.#files +./benchmarks/.#tex.gitignore +./.gitignore +./.git +./.git/COMMIT_EDITMSG +./.git/objects +./.git/objects/bd +./.git/objects/bd/7b4a58cc13ca497618e5f8eaa664727a489c14 +./.git/objects/04 +./.git/objects/04/a2c275e68174559191e1f03369472ab6cc79a6 +./.git/objects/04/266b2bfdc3cb6dd1557ddc6cca27dd1c2483de +./.git/objects/c3 +./.git/objects/c3/b0f3506b6efa82e7d8455a51c40f674e362019 +./.git/objects/ae +./.git/objects/ae/edba64f100def873a670083e7cc4dec013b461 +./.git/objects/ae/e951b55c41573ce87f3e1e9ed7af59cfd03465 +./.git/objects/ae/6fb476af3ed0c1c09bac3cca8825a1983f6baf +./.git/objects/80 +./.git/objects/80/c9e3062a7a04de5f4fac4e99402caa9b51a06a 
+./.git/objects/80/c8e9cfeb808e093b68db94016ed90c821111f7 +./.git/objects/pack +./.git/objects/pack/pack-866532c1406fa054e3eb4a18c69f02811d0d9218.pack +./.git/objects/pack/pack-866532c1406fa054e3eb4a18c69f02811d0d9218.idx +./.git/objects/ed +./.git/objects/ed/0f521c10cf8d6bff59836ec4ccbe7e5457833b +./.git/objects/7a +./.git/objects/7a/93d826a964cfcda511666ea90cae1c932fa420 +./.git/objects/7a/948d869ac1991341c6a809fff33fe6f7578176 +./.git/objects/7a/11dea700ff573997fa79ee6d720d8c36bb8c24 +./.git/objects/57 +./.git/objects/57/41501919c83c417773c4db93c40f064c01b9fa +./.git/objects/57/5df2075ef86ab5052cf3f640cfef0ade065079 +./.git/objects/c1 +./.git/objects/c1/b86cbaa2bc74068aa01996d5361165488ad90b +./.git/objects/d5 +./.git/objects/d5/fae05fd2c4e4319d1d8b72b47c5bf7c5c988bc +./.git/objects/d5/a493ad0448a01ef3b090cafa65e4052e3e3443 +./.git/objects/d5/18889bb8e29093ac8c151dbfb94ecdb55da37b +./.git/objects/c0 +./.git/objects/c0/3edfcb9c4d1ba4d5c714784719f2babc0d4bc2 +./.git/objects/ea +./.git/objects/ea/59fc8cae70bc30d4925b5d754d3eaa4c86fbe7 +./.git/objects/ea/96d0c79a09edde6fb5fdca45fad5eeb6b94a3b +./.git/objects/ea/d533485541976b0e8651823b48dd3b893fd7bf +./.git/objects/bf +./.git/objects/bf/9238d7470f96c20ef3b8ae9ae800c29a24b7d0 +./.git/objects/da +./.git/objects/da/5d5f8e2a5adf35ea1918972f87bae5ef20eba8 +./.git/objects/da/9d6c16f47466f0483a426802c81a4e2f105565 +./.git/objects/5d +./.git/objects/5d/68a2c24d022272086e02e201a34a7cc471a810 +./.git/objects/d1 +./.git/objects/d1/baa7c6cb5a197c2fa16d74d015cb65383befae +./.git/objects/d1/4a2f9a73098a607be995953a176a4c158eda31 +./.git/objects/a8 +./.git/objects/a8/385bd4963671a9f8e46367258ed74470230be8 +./.git/objects/a8/88411f7ae15f92b0a4f115b6a198bb9fca17eb +./.git/objects/a8/6354c21c4ea14488a7749ce9ca7319dfd7962d +./.git/objects/f8 +./.git/objects/f8/402f7182db519f40022913d9b7f545155e23da +./.git/objects/f8/fe27353f98e6449f9c140ed285c66111a83c8a +./.git/objects/32 +./.git/objects/32/7e89b0ce8c6c488a6052e2de5204ef89e2f507 
+./.git/objects/32/46e88c270717153eee687e584c48c2b7611500 +./.git/objects/d3 +./.git/objects/d3/4bab001c846b49c144845ae06905122fc00d5b +./.git/objects/d3/14cca150738f34ec06d0a620746fa4b664d488 +./.git/objects/d3/c1b11ad75157f9b7976205c02c5017edbe91e5 +./.git/objects/7c +./.git/objects/7c/4fff7192504e558898de1a0a6a03f40b1ba383 +./.git/objects/7c/ca263ea8cf851ed5e6b620b1ac975a52cd1ab9 +./.git/objects/7c/c8bf775e67bee805e45e2c2de7292c060f31ce +./.git/objects/e3 +./.git/objects/e3/fee6d0fba12db992dab0c59f0c0af51e00f0cc +./.git/objects/73 +./.git/objects/73/e55a4522cb6094dbfa5259ea1246a80e3b8c40 +./.git/objects/73/7cc095a1c47d9895cf38d27633f8fde7024868 +./.git/objects/73/afa2a0065c795b73f8ac64879d85342765624a +./.git/objects/48 +./.git/objects/48/85f690a7e2bbf6c95ab017ebeddf13d4adbf01 +./.git/objects/48/a13cb1beef406dd4a4e2306e1e37f6836e4825 +./.git/objects/8c +./.git/objects/8c/a9e8d133eef4f47c05922233619de5a86ee4ab +./.git/objects/8c/df1c29071e45af80bf897cac73b4c1566e2d50 +./.git/objects/1b +./.git/objects/1b/9d5c43861498c1854465ac03e76a514d5cbaa8 +./.git/objects/50 +./.git/objects/50/ef8ba0f5817ad270ae921fea07bb8797b9c267 +./.git/objects/50/48505dcfbfd75f4bbad53bf5d71105215052c7 +./.git/objects/eb +./.git/objects/eb/31ab22a61ecfba506504fdf4b0f5cd28af2e11 +./.git/objects/eb/bf41561fdfffa8b973ad8c8d60c00907c59571 +./.git/objects/eb/dbd748cd4c6e3fcadd5087b9508a20a34c8125 +./.git/objects/ef +./.git/objects/ef/5f4e84fc1fb16d0e6a3b6402b2ef7b4ed4597f +./.git/objects/ca +./.git/objects/ca/67f392b19172f6bc90132ef568aeadff4c47e4 +./.git/objects/ca/9cd9a973452dcedfabe0bb1ee39eaa825856b7 +./.git/objects/ca/0249ae201624a0915b20a04f0bea1e88fbac12 +./.git/objects/c5 +./.git/objects/c5/531af344c2046d8887b5a8414fca0034be08c4 +./.git/objects/c5/c32cce665964562c01cb70dbd1d931d0fc9013 +./.git/objects/c5/dd991dc612c5cd22a26633a5ab5cb1e19065c3 +./.git/objects/75 +./.git/objects/75/c564016f8ebf5726272c3f624f8f607002bbda +./.git/objects/4e 
+./.git/objects/4e/f45ea3fd278fb84e61bb004f62beeb6e58a461 +./.git/objects/4e/af84d38158ff6aeab975f4b93bca267185ff75 +./.git/objects/cb +./.git/objects/cb/a0873bb0fb8cdd095efea7fc00d244d9f33dd6 +./.git/objects/6a +./.git/objects/6a/00cd81ccdbe607f9585878296c1e1e2c5b7cbd +./.git/objects/6a/93ab65f245de47a5a6c97f6b95e2b5ed93c6ce +./.git/objects/6a/521ac0a70a030b4570b17ce369ae7e0ccb5bcc +./.git/objects/6a/00c91ef0a06a2aa19cfb94ba06503cb1d0a2c5 +./.git/objects/e8 +./.git/objects/e8/4dc23cd8d2b83c0986002851ad228ca86f104b +./.git/objects/e8/ad3cd2087d55ff7928f7ad191a5c447a86a253 +./.git/objects/06 +./.git/objects/06/1a15896701e0696b9056e8c1f3ed0602b696c7 +./.git/objects/06/8886279674c32c6c0189ca0eb9ca3427b6a73f +./.git/objects/06/96d3afa20ddbad8c4e09ffc80da6a062ecca0f +./.git/objects/26 +./.git/objects/26/d94482417d0415bc3074d578ea15fa29bf61eb +./.git/objects/51 +./.git/objects/51/21ef81a3c018e7ece82b8fa53804f49b187e61 +./.git/objects/51/7475c493f88584541456158468830c728a385f +./.git/objects/f7 +./.git/objects/f7/e5d4ab3ab0b9b806b17f166db3de7cb83e16ba +./.git/objects/49 +./.git/objects/49/b13c62775b3f1ae207c1c7b937b8799d280a53 +./.git/objects/49/9613067efd80e477704c5752e2e24d16b685e5 +./.git/objects/e7 +./.git/objects/e7/71a89a26f22e59b14d4537f341206ae7e2dd59 +./.git/objects/5c +./.git/objects/5c/28f300f108705b0838519b16b64040e1753dac +./.git/objects/7d +./.git/objects/7d/24c5c18d327f576162279ec5c8d1040ae9f53a +./.git/objects/d0 +./.git/objects/d0/bdcf3a2e68cf891369b37c0bdafe5b3a8c3721 +./.git/objects/ad +./.git/objects/ad/dc2932430682c6f4aebf981bbdb60bd8650a46 +./.git/objects/5e +./.git/objects/5e/5a6c6e318cbcc56ac17c785e42512984a54ffe +./.git/objects/5e/65dfb3785145e3359094f2da9a9308ca89f95a +./.git/objects/5e/e61c21b52ad286753d76024e686ae2188f3d9c +./.git/objects/5e/7211f1f557ceea2922e95c9353344a7d8011eb +./.git/objects/60 +./.git/objects/60/825f155344cfac612ddb9aa4056cee1a3758fa +./.git/objects/60/df4d354a7d252d34d31d266064a498bae0c00e 
+./.git/objects/60/9434044eb5c63cfbe323346a76c37316f5d3cb +./.git/objects/68 +./.git/objects/68/7f8ea68a53e8a91694db6e8e19309c7aaff242 +./.git/objects/68/5de33ed341bc2e51a650f5c42f02a4cea86c74 +./.git/objects/68/ba3add2e2f20ff74ca504aa69a067269748796 +./.git/objects/68/54eba89564df3120ae33bd3905f70af0a9508e +./.git/objects/bb +./.git/objects/bb/26307254872961f81864ae695a2212da53e305 +./.git/objects/bb/0b632d318e1cc55446ab86264a943dc662dedf +./.git/objects/bb/04232d06b8b1b51d0313c76ecdf53fbb38b133 +./.git/objects/79 +./.git/objects/79/05fb61e4f1057e53188709c091739c2adabdce +./.git/objects/3b +./.git/objects/3b/6969415623e49b583fd9623a21269d16cf67d8 +./.git/objects/3b/a8e873cae0cd0d76051c8845a05ee7e5d6d2d1 +./.git/objects/24 +./.git/objects/24/18f0e1209770aeeb848a83fab8a87a5e9a3a86 +./.git/objects/24/4d94f9094fe4b391a781a7b5059d19233558c0 +./.git/objects/1a +./.git/objects/1a/a9a109bfb5e0100583062628741d37edbef198 +./.git/objects/cf +./.git/objects/cf/b4058d5efe88c7e11216de67c3ddecfaeb74c1 +./.git/objects/cf/ca9f3ec8ffd3a716e84a1816fd2f290aff45f3 +./.git/objects/cf/249dc6bf90a05272ea34a37c179695ec1679e8 +./.git/objects/41 +./.git/objects/41/7e9c7be145d7ce950e2634e24bdb7ead9eaf27 +./.git/objects/e0 +./.git/objects/e0/cf46969ea651e5939b9d5f048eaec3ec1de4c2 +./.git/objects/e0/56cae16e516031cd82e2190a7a021a95145026 +./.git/objects/e0/897837a3548051e45ac861ab417adf7c59829f +./.git/objects/e0/d97cc471e24acd66068a44308c7abff19b1925 +./.git/objects/91 +./.git/objects/91/a4117c6026321105e7a694173a05b2644d77c2 +./.git/objects/91/b1cee3be1539bb24a359d20d8a5e076a9cf30d +./.git/objects/fa +./.git/objects/fa/eb7cd164e1c586e7d45a09e0619cf9a74f54af +./.git/objects/fa/8e7262e1a26131ac68a1a911baca583f27dca4 +./.git/objects/fa/9de6a7c9f2dee85a1145e1fc4c5d1b570e964b +./.git/objects/2b +./.git/objects/2b/4c9ad40ba8fde41829a22e88bd26d38cd513f9 +./.git/objects/54 +./.git/objects/54/f717fa1d49e761b89e8f8097e8e3afde5cc6f7 +./.git/objects/54/f513d93e0e3263265c24591a7c874b53030436 
+./.git/objects/b4 +./.git/objects/b4/6ae8881ee452f7a4ff0efb75bac923f36d14aa +./.git/objects/b4/8ffdfbe33be1dfd1783c8ff6b1e8fbe41db10d +./.git/objects/b4/e29997072acf52d882cfced7e85bb3b37460b5 +./.git/objects/b4/25a61759c2334ae65b46e56533f54804906c7a +./.git/objects/ac +./.git/objects/ac/1468dfa7da0b2fea46f17a3525c19feacdc1a1 +./.git/objects/ac/375885c60a0ea3a99d8bb439109c1fa895761d +./.git/objects/ac/fe49dc5da96f42f02493db63c143e2e2babad1 +./.git/objects/ac/6c0f696722f17907f137f4c7fad91e64f49dac +./.git/objects/e9 +./.git/objects/e9/cc98781f3762e4dd7e8be3b025723674999a31 +./.git/objects/1e +./.git/objects/1e/08112d5532b7b52a6d51270e73b5bf019ebd0a +./.git/objects/6c +./.git/objects/6c/e4e9093f07b21a48bb122f10a347e38eac84a3 +./.git/objects/03 +./.git/objects/03/fb2ad72c7a8427949a2bd42155098c7ba5292b +./.git/objects/b5 +./.git/objects/b5/3290a08b842bb02ec8b9a453ddcce593595285 +./.git/objects/1f +./.git/objects/1f/b018c263111f5612dff4a0714d133fa14b47d0 +./.git/objects/f5 +./.git/objects/f5/5fa50d15939fef068814d393f10c47d6c24df0 +./.git/objects/78 +./.git/objects/78/c2eef20df32f85b780af9ea5cda026a0c597ec +./.git/objects/05 +./.git/objects/05/c94e09c13c59b6b0313c670a3b2b05f104e9d5 +./.git/objects/05/6aa0e5511235422238f4944ab543c1339ba472 +./.git/objects/05/2233649e450bb03b929975139eeb624310852e +./.git/objects/05/0ac4c6759899cd6c1c48d74aaa8cdeb8802d67 +./.git/objects/05/5c86781ebb8d7466e996d2a953ffb2701160b7 +./.git/objects/3a +./.git/objects/3a/b09d8cd602c91f6b13418aedfc10c6dea50311 +./.git/objects/3a/0080d9bd9df6701481c73c108df8f8aba270d6 +./.git/objects/3f +./.git/objects/3f/121aca11150765ad412e552c38cb846cdf1b25 +./.git/objects/9d +./.git/objects/9d/c192d47b5c60f58ea01b6fbb3cac7f95355920 +./.git/objects/dc +./.git/objects/dc/81451f732d8886a46ba4be0eb8689c3ffa05ff +./.git/objects/dc/277721b85d2c24888d9b295d73e77dc0b56c47 +./.git/objects/11 +./.git/objects/11/f3056a841ad601f309f9bb45fd1e7a45bdc35a +./.git/objects/09 
+./.git/objects/09/83adb85f1e3af304b3219d112d87868940bd34 +./.git/objects/09/9d1ab30c98206fe2cd21e5351e4e739ddca093 +./.git/objects/a6 +./.git/objects/a6/78884c9d14735ad62e6ffe8edeb237b153b31f +./.git/objects/90 +./.git/objects/90/d315297656082d1d3fa42398237e6fd7ca8b77 +./.git/objects/90/131c7cf785f05180b94d3fa03ea5a15bc3fc86 +./.git/objects/9c +./.git/objects/9c/5bef9eb738539b7ef600afb5e4a4daf520ecf3 +./.git/objects/9c/5e872385933c0748afbe0d04721d2d87fcef7a +./.git/objects/9e +./.git/objects/9e/967e175d46284466def0287af70684002ebc9b +./.git/objects/98 +./.git/objects/98/2ac64fcb212641da96dd4aeba1c4d3a2089c8d +./.git/objects/83 +./.git/objects/83/c7ca0b3a62936df5bdadf81866f2f42a60201d +./.git/objects/ee +./.git/objects/ee/905700bdc10dd27f0b9b5d3e8e7c63cdb003a8 +./.git/objects/ee/137eb490291da49905c526776f3240dd994bde +./.git/objects/8b +./.git/objects/8b/b979b870b98d1421703d3c5f9b4506b34da31b +./.git/objects/8b/94333973de20dc2f9f6fba276d77957a585f19 +./.git/objects/8b/49a1a69ac34a2a9133ff3cfb2b30d7ce21269c +./.git/objects/cd +./.git/objects/cd/149e97dcb42e9893f15b0908cc6e7be8c176ad +./.git/objects/f3 +./.git/objects/f3/ff42162b9ceaa17323e15da2fcae4455acdc94 +./.git/objects/df +./.git/objects/df/0cad1cb3e9fa09265d3f63deedc382639dd902 +./.git/objects/74 +./.git/objects/74/3d2df4b1818f0aa743f57a85a4a5fb41b01fa9 +./.git/objects/74/01fc63e34bb3750d5b16c5d151653c8d6cfbc0 +./.git/objects/01 +./.git/objects/01/2e7dc0ac588f0e708ee8a8d115dc7c064c12b7 +./.git/objects/01/b09c53d179d687bc0837662bc48601e928502e +./.git/objects/01/7b3e99b3434bfe6ef2edb0a75b080311844169 +./.git/objects/dd +./.git/objects/dd/ad7182883890236f925f3353f4873c0b75034a +./.git/objects/18 +./.git/objects/18/8fe3d25861b14cd4af1163410b61f73ca4a04e +./.git/objects/62 +./.git/objects/62/064f59de8e93a5664bf43829e477bd9985cef6 +./.git/objects/2f +./.git/objects/2f/cd5c7dea3df5fa12fcbb5f26c8582b2910c447 +./.git/objects/b6 +./.git/objects/b6/8cc01c492b43223b504821bfd73b4f583ef4e2 +./.git/objects/be 
+./.git/objects/be/6b86d77fbb51906eacba3df56274612f1c3029 +./.git/objects/be/0938c18155c11f355d0fdc81c489c599cde3a7 +./.git/objects/be/d2ef86152fa9a0aabd84027e317733f0dfbae7 +./.git/objects/be/f110dd4cfaa19684396aaf93b27a5fb3c0b945 +./.git/objects/f6 +./.git/objects/f6/354b6caad8336c77feff4de8509c274a27fb70 +./.git/objects/21 +./.git/objects/21/5eb4b90109c24b08014bba985aa654feb57c59 +./.git/objects/f2 +./.git/objects/f2/687c378697490c59291d1cec912fe66852453a +./.git/objects/f2/d72115d8389718be364c28fc13785e5277b180 +./.git/objects/3d +./.git/objects/3d/a02568239608f630e802f4dd777449d0913e03 +./.git/objects/1d +./.git/objects/1d/2143cfb7d3376ecd5486c18c779c9d144d384c +./.git/objects/6f +./.git/objects/6f/5fb29affd8ac04ecf3574d5e99a716b414f2ca +./.git/objects/6f/6df7bced1281095533e756c0d6b9e7373f1a8b +./.git/objects/6f/52a7ca4ac7a38e8705299c2d9473ca1058b627 +./.git/objects/45 +./.git/objects/45/f68c1233212098478ac83525e9e68e2df1fadb +./.git/objects/45/6f776946d4828283ba374a165742234590fcc9 +./.git/objects/45/7aca0b40cef66eacf359630f145a4ff26364f2 +./.git/objects/5a +./.git/objects/5a/de07472965b3a18ce368b40dc9c904278142b9 +./.git/objects/info +./.git/objects/af +./.git/objects/af/f436ea457490f0752d934983bf1bbba2e2197a +./.git/objects/af/45c22cb2b534b5698562b3080e747fefa1b981 +./.git/objects/d8 +./.git/objects/d8/f83208765b69ade21aa13ffcdd38648bc54f14 +./.git/objects/70 +./.git/objects/70/e2c3dfaef29f8c8a3fa5d714105d3f5c3cd10f +./.git/objects/70/d74188dd90fb7e09d2d163dfa874e7c9e434bd +./.git/objects/61 +./.git/objects/61/a4c4b922c4ef41400d990031f3ac12f6a0888c +./.git/objects/61/6f06980312fc46031d9f21f29beef5e17d46f1 +./.git/objects/61/21078331176aaee1238c0f059f7f94663690b3 +./.git/objects/61/b65d28779937e8010a8c9d5f7eeeddbc5e0294 +./.git/objects/84 +./.git/objects/84/6ed3f02af5b46751dc2b0ff84690a84abf8f52 +./.git/objects/84/4378673c7db10ddc3e8d22a8187ebbc8905030 +./.git/objects/23 +./.git/objects/23/f56c7122758ebed362f40e98d4f6030659abb4 
+./.git/objects/23/52ab57fcf2e8a26f2bf625baf5b5bbfb831417 +./.git/objects/23/b7b897cff7f35ae6566ce97c6c7c2590df15f2 +./.git/objects/23/cb216300bc481ca8915b935e563f4d409199e1 +./.git/objects/db +./.git/objects/db/06339ac988ad5c24e59b1e72038c9bc8dcc4d7 +./.git/objects/db/1f143eeac8eeb55f3ef826718e1d6f915cc5f8 +./.git/objects/f4 +./.git/objects/f4/427b4d63bf1508f56f365c65e85280d1af5763 +./.git/objects/f4/a591498b21a52c805a11509178fa5d0915f526 +./.git/objects/f4/0c870089b85b6402c0a2243f740a1370c79286 +./.git/objects/0e +./.git/objects/0e/189b2485f4535994863ca52a6420ec623f819d +./.git/objects/0e/e9b8159d8b6e430b82845f790cb54669e833f3 +./.git/objects/0e/e16c90707c76e59789d809ab5fc41fc91f5f67 +./.git/objects/0e/9d4bcf8f359d586197ff7e48bfb2662c0c0935 +./.git/objects/d2 +./.git/objects/d2/00c42d909a31c17dea48da932202860b406cf9 +./.git/objects/d2/b9537b66981e8063b93ac516098a9ab07a531b +./.git/objects/2d +./.git/objects/2d/45d0f37ab6828e8531fbf86096bc9d8e895527 +./.git/objects/2d/99122bb3539025961f0fda6ae664b7342b6b83 +./.git/objects/2d/1e8b892f7ee580e10244ba83ac6cc89e7ba727 +./.git/objects/2d/4b717bf8427170e5906538104c6d0fa4d630f0 +./.git/objects/82 +./.git/objects/82/56665942d2d563c29a09c8e8dd633014b4514b +./.git/objects/82/08d67320a0aff6533a4b7fac917ddc79f95418 +./.git/objects/82/69e1621ff5621db953bb1e5c26ccc9141eb8b3 +./.git/objects/0d +./.git/objects/0d/6a0b5fe6a1bafad0544e22510df8a62335a609 +./.git/objects/0d/36a5901973e607b333be7b03ba47eea66dc403 +./.git/objects/0d/42bacc3191f370f0f4bc276cd908061f9ac828 +./.git/objects/0d/707ecbb8abf68b3ebfb525b63937f9c805cef1 +./.git/objects/10 +./.git/objects/10/48a1a8ae5742d323ae648c92a777d1d2d2015b +./.git/objects/59 +./.git/objects/59/151f39243bb30f51e9dc7595311d93103d6eb4 +./.git/objects/63 +./.git/objects/63/9f63c06b29f39840edd73ad04cd3b8309204f0 +./.git/objects/63/2775a8aa713d331205fa1d8c167e84ad9186cb +./.git/objects/5b +./.git/objects/5b/52231d17726d4ad4ea7994fe105eb81e9e9dd9 +./.git/objects/14 
+./.git/objects/14/36d8a33498d27fa02e0f2e3c8862475bfacebd +./.git/objects/17 +./.git/objects/17/662c8875023c93144ae394f2c76c8c3cda38a0 +./.git/objects/17/54196f5997eef681179aedae446349ea1b8f95 +./.git/objects/3e +./.git/objects/3e/997e9b6647df287339fcde654c42aa30505be4 +./.git/objects/3e/cc46e96a32867bbe05b2290fc126ab93f85bcd +./.git/objects/56 +./.git/objects/56/fd5d33c9c2bfaa8805dc058d7fde9152723a33 +./.git/objects/56/f541e80ea129e6e064f8b43cacacb70e17953e +./.git/objects/bc +./.git/objects/bc/1bf84bbb65652f4b787dd7b446860a32c2e8ca +./.git/objects/bc/82b6b7fbd83ed55e18d675745ba2efb6239f5b +./.git/objects/8f +./.git/objects/8f/7dd0e6b24f8f93777152241f7891017c613e4e +./.git/objects/8f/a44135f287836c9c362922897cdf50c8abfb0a +./.git/objects/40 +./.git/objects/40/953ed78adacbd5e220aad3cec4ac5e30b90bd8 +./.git/objects/40/17069d256c4b4b46ace0a1d3a17ab474705571 +./.git/objects/40/59ef7942ff3505c5ce73ce59f4a7bfd82ba6b9 +./.git/objects/40/3ce1a8cdab4310a2e1867567be6e847f71efd9 +./.git/objects/b2 +./.git/objects/b2/6158a9e38f0450e55de2bcff275ceb5322c6ad +./.git/objects/b2/c32bc79af68feae3d4a621976396efe0820798 +./.git/objects/9b +./.git/objects/9b/5e363c1cdd23d5c116b74644713ebc7fe73af4 +./.git/objects/9b/e2d15472a28c004e57f8e223ed6d0141689f60 +./.git/objects/29 +./.git/objects/29/bc4dcda2ef8a866b6c44a8a86f06e38e3a486a +./.git/objects/29/e47e432042221c28bcf000f50ad6a08a75dbed +./.git/objects/29/2386fb0d6dca1127e6c05c2ecdba6e4a93cea7 +./.git/objects/58 +./.git/objects/58/ab0cfc01268a2ffd914ee9e62f9af1da78d178 +./.git/objects/58/491745d53e90b2f018604a67a00e301c9dfdfd +./.git/objects/19 +./.git/objects/19/1e9420d874de0715def8962f446e3aae2e05c7 +./.git/objects/67 +./.git/objects/67/96afd7d35321930d2c306f64c3ed4d72dfe482 +./.git/objects/1c +./.git/objects/1c/9d91b14661081b506e4dee4daf78b1567e3323 +./.git/objects/1c/3c0f6bc181c8c3904ad239102c183bfc5816d4 +./.git/objects/4c +./.git/objects/4c/5a7864dd9e018485e98f98f84b706a5510ae30 +./.git/objects/92 
+./.git/objects/92/36c995e938d2c12a9a49714b6258db02ae39c3 +./.git/objects/92/7360fe2e74518ab716f01252c732a7269a7631 +./.git/objects/92/d9903d9e8f6dc0216f8644be14fd733977dd79 +./.git/objects/39 +./.git/objects/39/7b5ce17fe342555c16de630d19ba1b4773fefd +./.git/objects/39/41aa48e3029a96c11c0f12f9e94043ec655815 +./.git/objects/39/f544d4c59fd91822f5189ac7c70f6066bef54b +./.git/objects/6e +./.git/objects/6e/4bee91d25e88a35f0be2fc7a14552627915afe +./.git/objects/6e/2fab67530969884622b05d64c3d049465b5efa +./.git/objects/d7 +./.git/objects/d7/775f3198f4dbbb284ac1d2bfbe0789f95bef26 +./.git/objects/d7/b4d0ca5ca71281fdc77b49820272ddea245958 +./.git/objects/4f +./.git/objects/4f/f6719cf0d54eddb41be1f095ceb9d272a92aae +./.git/objects/b0 +./.git/objects/b0/4c7a44f9ab42e495553aa332bc80e3685021b8 +./.git/objects/b0/4eb2c9a981a3a7459b0053d7b8436471066730 +./.git/objects/fd +./.git/objects/fd/eb0e5a19478cca5377cb47fe41426975109205 +./.git/objects/fd/cdf055082dc71e0f30f962578f214e6812631e +./.git/objects/fd/ff1ae9862d61a9e117e56b3548078193cbd078 +./.git/objects/08 +./.git/objects/08/2e3a2cb843392a72074b65718bc672f56835cd +./.git/objects/ec +./.git/objects/ec/5efce500afb6393d66da480b2b529a4f4f348d +./.git/objects/ec/6fc32ccaa7ee6a6f3e732578eb4c98817deb83 +./.git/objects/16 +./.git/objects/16/252d2718003ea39fa83a16a84db4fc616d73d8 +./.git/objects/16/86d481b6ef09f351775a4dc1ce272c304df1ec +./.git/objects/2a +./.git/objects/2a/6f80a64c07b516b56c681e9d6e75598b43b207 +./.git/objects/27 +./.git/objects/27/045100d894b6044d1ae548948bdfa8b2b74874 +./.git/objects/31 +./.git/objects/31/9b57b742825237cdbd9b368d9cce7127be9775 +./.git/objects/28 +./.git/objects/28/77fdeda6264e50d9a6ec4cafa505de0507cbac +./.git/objects/28/7c7c56177b382323e8fb3b41db8e02d6b8bd1c +./.git/objects/28/bfeae2edb4cd27c3323cc44ada958aea2ec165 +./.git/objects/64 +./.git/objects/64/dbea3ab74738f8ee05cf874665a7c7181c5649 +./.git/objects/9a +./.git/objects/9a/03b54f19d91702674f804179c0d86dc8f763f2 +./.git/objects/34 
+./.git/objects/34/139b81408463cd1ce871a3e954bf062e1c9711 +./.git/objects/34/912ecae4fdb5c0e898af5f0c5e2350dcad4cf0 +./.git/objects/34/d776502fcdc7f0ccc2bd972e390133526b91a0 +./.git/objects/7e +./.git/objects/7e/8a190738a7c9f6c5a6a520f817c85e41d02b46 +./.git/objects/7e/b3436415d6bb988d8c0a533f4b7fc2f66491aa +./.git/objects/7f +./.git/objects/7f/423a93c268c328b5e161d986d4476e2cf4f571 +./.git/objects/7f/c7ab857ad56796463f01c4f1cfc89e378ca221 +./.git/objects/7f/10d6e0684b637206d6919b5956a1ac6ed37e49 +./.git/objects/13 +./.git/objects/13/4908aac9fe7db898b18a0a7e154072c1a3d7a2 +./.git/objects/f9 +./.git/objects/f9/85215f4f91d9cc3b848e56f78aa519ae49b5fa +./.git/objects/89 +./.git/objects/89/abfc5bdc924ff5551bfaff18de67741966b15d +./.git/objects/89/1c6f722bff534ac88c0dde8e7e63454d199296 +./.git/objects/89/735db4f10c18709ada2dc788a9743565f55388 +./.git/objects/89/a9e26eeb0a3ffd689ab5e98e43a21a1c0a87ae +./.git/objects/89/855435ff72be41295f28ee57c2bee30202de47 +./.git/objects/b1 +./.git/objects/b1/c385a303bc2dd3b850b41a1ae0410af2c5786a +./.git/objects/b1/eefe4521c5ef85f8b779add575aecf58393ce5 +./.git/objects/b1/bacc7a0b15fb01e09e1a59ee6e5bdfc2925b54 +./.git/objects/b1/ec612772c1c2249e6af3e7ba77585ee5c13c58 +./.git/objects/ff +./.git/objects/ff/3defee152ceda6082a9dc5040cd433c61efcf9 +./.git/objects/ff/f440d7c7f9aad902701b385561ddc8b77f8fe3 +./.git/objects/e5 +./.git/objects/e5/3214c9e3cad529dc0d4dfb61b48f478bfbbd4c +./.git/objects/e5/f4ff38c22af8c8dc51a7c94b4f1fb83b6558e5 +./.git/objects/e1 +./.git/objects/e1/865c1c5fd94104ed9bacb4cd53adfcd83a9697 +./.git/objects/7b +./.git/objects/7b/a0dbd949f60a1559d96d911cc33efc6939d01c +./.git/objects/7b/ea82ee2c4c1021b2e20ad5276414d1b4246c47 +./.git/objects/3c +./.git/objects/3c/63515ea22b572d39b224cc508908853084b123 +./.git/objects/8e +./.git/objects/8e/272b49dc5f3be82d469086ca9d3bed7253e70f +./.git/objects/8e/749d82d91ba0123f0bb4ebf646857049b8ef0c +./.git/objects/c9 +./.git/objects/c9/df108c929b2e5a312cd86c704c090013039e69 
+./.git/objects/c9/e2566ac9d03be1f74c0ffb437998644dd60e02 +./.git/objects/ab +./.git/objects/ab/52adec5fd340b834de7014d1f3471728801848 +./.git/objects/ab/22631c8807eda2cb9fa48606aa58cbdb7dd1f4 +./.git/objects/ab/0db94c4fba3033a1993b146fcb7d4e6fad191f +./.git/objects/6b +./.git/objects/6b/c97371f4936bd31bf71f37d410f67879d208fd +./.git/objects/6b/91cabb14b820197c4a396e195d6cf8dcc18eb3 +./.git/objects/52 +./.git/objects/52/9076f7a1acb9707376f59e0e1248c348867e34 +./.git/objects/f1 +./.git/objects/f1/84063fe783cae53b303f169c9af5bca7b0f82c +./.git/objects/f1/6282f7f2786e1f9f1130343dc8c94b0b6dbf9a +./.git/objects/77 +./.git/objects/77/f0dc81400956ade5be5598af329b46c0fdeab9 +./.git/objects/77/f3677d9db06f94d516209acdfc19569736cc9a +./.git/objects/77/b205d10b78dc1cc7b318d8ad3ef866f0b88435 +./.git/objects/77/11a20542d388d93aa990522a45a432c2e5b61b +./.git/objects/00 +./.git/objects/00/deef1c92d3b244f7392a54e2e0da3fcf73cccd +./.git/objects/aa +./.git/objects/aa/55b7f297a28c5663cb8051c5f4e5431150e36a +./.git/objects/aa/056834a4ab2e1d476b371a91206c396affa82b +./.git/objects/de +./.git/objects/de/11623e687555a843d60ecc3f0c70f550e98e3d +./.git/objects/de/19613e554cca5fab317d3605f5b0ad8a8f5711 +./.git/objects/de/3effaea36556b96c74d86abf327517b289e3b7 +./.git/objects/55 +./.git/objects/55/04a585b2d05749a197cc648d4948585b002aeb +./.git/objects/55/5671ae38e662bc02b3e4f84af337732c725c41 +./.git/objects/55/ac449d0e9a26a294d47152e964564bebf0cbb7 +./.git/objects/55/1b2391f3cec5b8a7c6971bb6a4043f273eee8e +./.git/objects/69 +./.git/objects/69/6bcb9994deb32a6198dbe7683bc43e33fa8a9f +./.git/objects/d6 +./.git/objects/d6/c954feb40f1ee75fdcf00ca57f5c0d3d2e0f99 +./.git/objects/d6/c49da67ff56cbfc86661c5dfffb3b144ed5eb0 +./.git/objects/d6/811902b9ad236aad771ce776e1332736f7d88f +./.git/objects/2c +./.git/objects/2c/33e2d59e07e20924a9b25c6c737ad228ca9b0f +./.git/objects/2c/7cf5ed16ae8e4d7333984cbebaff7962b4eaef +./.git/objects/2c/6a5f8dd2ccd70f29683915413d5a7afb0c38e0 
+./.git/objects/2c/ea4a57dae3d943c974d35343fd14f857e4a537 +./.git/objects/2c/90b9d946389975380bd8b96ab03a5c325ca355 +./.git/objects/fb +./.git/objects/fb/143505c55d0b75752d2077621e7f6a057dbe0a +./.git/objects/0b +./.git/objects/0b/e9761371a503b2c6d83d8f7846543666377bab +./.git/objects/95 +./.git/objects/95/26ddc2f80b69eb676be5908b863e221cc55311 +./.git/objects/95/6fa5bf5ff46970f3646bb9d3a354587ce7a758 +./.git/objects/95/d981e795f5c2beb635a3e70f3ddbfede70fbb8 +./.git/objects/b8 +./.git/objects/b8/b07af1a1bc02d22ca70807a2e4604ed94bca68 +./.git/objects/65 +./.git/objects/65/2752f82065b53bd9e241d3b1ee260dd0dc7787 +./.git/objects/65/88c2c23bcac12f9692c2cd06480cee422c1c8d +./.git/objects/65/29f3b232b42e48a7ef1f935c765e60f34ef170 +./.git/objects/65/b10e153969f46f3d80b866741c0151876da8e7 +./.git/objects/65/964392595e262f0571912b5bacd9c999b002c4 +./.git/objects/c7 +./.git/objects/c7/7f3f64c1e9bfca7efb514ba08af74c9fae2885 +./.git/objects/94 +./.git/objects/94/87dd0b25009cbd8debaf3cb2edc383790677a5 +./.git/objects/42 +./.git/objects/42/f4ec8e259667b739c89b30684fa10b1dc8ea3d +./.git/objects/53 +./.git/objects/53/27fba844b2bc564d0aa3f1e3eb930bac3fb032 +./.git/objects/53/5ed8fc72b4b1e307dd19e89267438c4423db3a +./.git/objects/35 +./.git/objects/35/deb323feeaf3ca7e9bc101029176e11734d199 +./.git/objects/35/2afae96809d801c4b45997ee4431e5bb680ede +./.git/objects/35/fd399b11365e08ffbe39585abd3be44321c406 +./.git/objects/35/d2c09301c92eab37e03723c505065e494e10d1 +./.git/objects/35/cc754c3415cc8aad653e8b09af109fe391711a +./.git/objects/a1 +./.git/objects/a1/6cb5eafaef7a2354723f9c54d400f8ee743818 +./.git/objects/37 +./.git/objects/37/581301a02ffae12602a136cad472ea76cbdb8d +./.git/objects/97 +./.git/objects/97/26766d221f3303009799db3e7ea3bec6ed2cfe +./.git/objects/97/b4363d1844eda0f9979ff4734d54a160ea6064 +./.git/objects/97/e8f520105d1f196900c45665086a68b569b7e8 +./.git/objects/ba +./.git/objects/ba/cd1431be4d953a1388bbb2685601f8b66b6710 +./.git/objects/e6 
+./.git/objects/e6/f4ae3427fdc975b344464f9437fd6e783cbc2d +./.git/objects/e6/1fa3ced7bb7e13d8ee87f4eac3d1f00cb30ccb +./.git/objects/e2 +./.git/objects/e2/c6fe08e2c69707b9817a09375f099e1a93efc1 +./.git/objects/b7 +./.git/objects/b7/eab6d5e5b62a0a594a2b52998cecb5ec993800 +./.git/objects/b7/306f863c71df9b62eb25319df5aa04de67d1ed +./.git/objects/b7/755aefc6109b0581f359cfef5050110780bdc5 +./.git/objects/b7/6d3f36ede46cd4e3868631cd56d2f89ca420c0 +./.git/objects/72 +./.git/objects/72/f7b77e29f727976e5cef2bbf573ff3ed2be79f +./.git/objects/25 +./.git/objects/25/0e21592a948efb3b81f3d325d5e6a92f7b57bb +./.git/objects/25/d7ba5250934e6ccf724f22997447dd929a992c +./.git/objects/93 +./.git/objects/93/0be3835b8225191fc3e25bac75b46f59f1e4ec +./.git/objects/93/468fafb296570203c5ea7f53f3b0ca969b781b +./.git/objects/a7 +./.git/objects/a7/665a4d00775a6736b3bf9251fd402ed5214b0a +./.git/objects/a7/eb3f631f1391d84d787c36d5d81f1db6d64ff4 +./.git/objects/5f +./.git/objects/5f/2e350ec9cc6ed80e7ec93285d796b50ded711b +./.git/objects/6d +./.git/objects/6d/a9005dd1a094e5fdd61f98d55552c8ed25b758 +./.git/objects/6d/d60283ba8de90e73c86a0a1b3a0262faa83ca9 +./.git/objects/6d/0ea5ad50c394ef01a3691451aa546d2ad7aa3a +./.git/objects/4a +./.git/objects/4a/254c4789f8b2cfcfb690215dd3bf16774b060f +./.git/objects/4a/97cd73b86af93d38715655b2922602bf5effd6 +./.git/objects/4a/91acf8a657f209e77074dea27f43dece276dd9 +./.git/objects/12 +./.git/objects/12/cd765ea7be32a73ea2ec12268ccc7a978dd975 +./.git/objects/12/848fa310de1e10eb83f41ef8359b148f3ed74a +./.git/objects/12/0d81b2e9e89ed11a8c86fcaac0f19f44b5f4eb +./.git/objects/0c +./.git/objects/0c/caecffa9374685010200f29c869dce83ed02be +./.git/objects/fc +./.git/objects/fc/8440f85ef023184513b2b606ee6e0b40e5c6e9 +./.git/objects/fc/986c42168c88cefaefd8a8201d4455b4205672 +./.git/objects/fc/0781a4dc652c45cdf8122833f5f1638fe087ca +./.git/objects/0f +./.git/objects/0f/1f15d28a517a5dee0e6d81a1804c02380c26f3 +./.git/objects/f0 
+./.git/objects/f0/0a2246063ebd8671cafe9983de72bbc6e45f4c +./.git/objects/96 +./.git/objects/96/e4b97449ae85fad317880fe76a35207c9d419a +./.git/objects/96/67c6e2e8bfa8504c9af78bcfc698418349a3d1 +./.git/objects/96/005d21f7fc36c9c9a8767c7b457e51730aa953 +./.git/objects/4b +./.git/objects/4b/7bd2c810157b639d753ee04bafa89fd31d7c9e +./.git/objects/4b/7646b2252c04719e5b897e28f35510415a5a3a +./.git/description +./.git/config +./.git/index +./.git/ORIG_HEAD +./.git/refs +./.git/refs/remotes +./.git/refs/remotes/drup +./.git/refs/remotes/drup/master +./.git/refs/remotes/drup/rgrinberg-pp +./.git/refs/remotes/drup/no_exception +./.git/refs/remotes/origin +./.git/refs/remotes/origin/pp +./.git/refs/remotes/origin/master +./.git/refs/remotes/origin/re_automata-module-refactor +./.git/refs/remotes/origin/format-group +./.git/refs/remotes/origin/re_str-infinite-loop-fix +./.git/refs/remotes/origin/re_str-fixes +./.git/refs/remotes/origin/prepare-1.6.1 +./.git/refs/remotes/origin/travis-403 +./.git/refs/remotes/origin/benchmarks +./.git/refs/remotes/origin/HEAD +./.git/refs/remotes/origin/re-cleanups +./.git/refs/remotes/origin/exec-no-group +./.git/refs/remotes/origin/prepare-1.6.0 +./.git/refs/remotes/origin/4.00-compat +./.git/refs/remotes/origin/labelize-match-str +./.git/refs/remotes/origin/status-lazy +./.git/refs/remotes/origin/dead-code-deriv +./.git/refs/remotes/gt +./.git/refs/remotes/gt/master +./.git/refs/heads +./.git/refs/heads/master +./.git/refs/heads/re_automata-module-refactor +./.git/refs/heads/re_cset-abstract +./.git/refs/heads/re_str-infinite-loop-fix +./.git/refs/heads/re_str-fixes +./.git/refs/heads/infinite-loop-cleanup +./.git/refs/heads/dev +./.git/refs/heads/re-cleanups +./.git/refs/heads/status-lazy +./.git/refs/heads/exec-no-group-with-bench +./.git/refs/heads/dead-code-deriv +./.git/refs/tags +./.git/refs/tags/1.3.0 +./.git/refs/tags/1.1.0 +./.git/refs/tags/1.2.2 +./.git/refs/tags/1.6.1 +./.git/refs/tags/1.2.0 +./.git/refs/tags/1.6.0 
+./.git/refs/tags/1.3.2 +./.git/refs/tags/1.3.1 +./.git/refs/tags/1.2.1 +./.git/refs/tags/1.5.0 +./.git/refs/tags/1.0.0 +./.git/refs/tags/1.4.1 +./.git/HEAD +./.git/branches +./.git/info +./.git/info/exclude +./.git/FETCH_HEAD +./.git/logs +./.git/logs/refs +./.git/logs/refs/remotes +./.git/logs/refs/remotes/drup +./.git/logs/refs/remotes/drup/master +./.git/logs/refs/remotes/drup/rgrinberg-pp +./.git/logs/refs/remotes/drup/no_exception +./.git/logs/refs/remotes/origin +./.git/logs/refs/remotes/origin/pp +./.git/logs/refs/remotes/origin/master +./.git/logs/refs/remotes/origin/re_automata-module-refactor +./.git/logs/refs/remotes/origin/format-group +./.git/logs/refs/remotes/origin/re_str-infinite-loop-fix +./.git/logs/refs/remotes/origin/re_str-fixes +./.git/logs/refs/remotes/origin/prepare-1.6.1 +./.git/logs/refs/remotes/origin/travis-403 +./.git/logs/refs/remotes/origin/benchmarks +./.git/logs/refs/remotes/origin/HEAD +./.git/logs/refs/remotes/origin/re-cleanups +./.git/logs/refs/remotes/origin/exec-no-group +./.git/logs/refs/remotes/origin/prepare-1.6.0 +./.git/logs/refs/remotes/origin/4.00-compat +./.git/logs/refs/remotes/origin/labelize-match-str +./.git/logs/refs/remotes/origin/status-lazy +./.git/logs/refs/remotes/origin/dead-code-deriv +./.git/logs/refs/remotes/gt +./.git/logs/refs/remotes/gt/master +./.git/logs/refs/heads +./.git/logs/refs/heads/master +./.git/logs/refs/heads/re_automata-module-refactor +./.git/logs/refs/heads/re_cset-abstract +./.git/logs/refs/heads/re_str-infinite-loop-fix +./.git/logs/refs/heads/re_str-fixes +./.git/logs/refs/heads/infinite-loop-cleanup +./.git/logs/refs/heads/dev +./.git/logs/refs/heads/re-cleanups +./.git/logs/refs/heads/status-lazy +./.git/logs/refs/heads/exec-no-group-with-bench +./.git/logs/refs/heads/dead-code-deriv +./.git/logs/HEAD +./.git/hooks +./.git/hooks/prepare-commit-msg.sample +./.git/hooks/pre-commit.sample +./.git/hooks/update.sample +./.git/hooks/pre-rebase.sample +./.git/hooks/pre-push.sample 
+./.git/hooks/pre-applypatch.sample +./.git/hooks/applypatch-msg.sample +./.git/hooks/post-update.sample +./.git/hooks/commit-msg.sample +./.git/packed-refs +./_build +./_build/oUnit-test_perl-rgcaml#02.log +./_build/oUnit-test_str.cache +./_build/lib +./_build/lib/re.a +./_build/lib/re_emacs.o +./_build/lib/re_glob.mllib +./_build/lib/re_posix.cmi +./_build/lib/re_str.cmi +./_build/lib/re_perl.cmx +./_build/lib/re_glob.cmi +./_build/lib/re.mli +./_build/lib/re_pcre.annot +./_build/lib/re_emacs.annot +./_build/lib/re_cset.ml +./_build/lib/re_fmt.o +./_build/lib/re_perl.mli.depends +./_build/lib/re_posix.cma +./_build/lib/re_perl.cmi +./_build/lib/re_glob.o +./_build/lib/re_str.cmti +./_build/lib/re_automata.ml +./_build/lib/re_str.mli.depends +./_build/lib/re.ml +./_build/lib/re_emacs.mli +./_build/lib/re_pcre.cmx +./_build/lib/re_perl.cmt +./_build/lib/re_posix.cmti +./_build/lib/re_glob.mldylib +./_build/lib/re_pcre.mli +./_build/lib/re.cmti +./_build/lib/re_emacs.mli.depends +./_build/lib/re_perl.a +./_build/lib/re_pcre.cmxa +./_build/lib/re_str.cmxa +./_build/lib/re_cset.ml.depends +./_build/lib/re_glob.cmxs +./_build/lib/re_pcre.a +./_build/lib/re_perl.mldylib +./_build/lib/re.cmxa +./_build/lib/re_pcre.cmo +./_build/lib/re_pcre.cmt +./_build/lib/re_str.annot +./_build/lib/re_posix.mli +./_build/lib/re_perl.mli +./_build/lib/re_cset.cmt +./_build/lib/re.mli.depends +./_build/lib/re.mldylib +./_build/lib/re_str.cmx +./_build/lib/re_pcre.o +./_build/lib/re_automata.cmo +./_build/lib/re_posix.mldylib +./_build/lib/re_automata.mli +./_build/lib/re_cset.annot +./_build/lib/re_fmt.cmt +./_build/lib/re_posix.a +./_build/lib/re_glob.ml +./_build/lib/re.cmxs +./_build/lib/re.cmi +./_build/lib/re_str.cmt +./_build/lib/re_pcre.cmxs +./_build/lib/re_pcre.mli.depends +./_build/lib/re_emacs.ml +./_build/lib/re_fmt.cmi +./_build/lib/re_perl.ml.depends +./_build/lib/re_emacs.cmxa +./_build/lib/re_pcre.mllib +./_build/lib/re.o +./_build/lib/re_pcre.cmi 
+./_build/lib/re_emacs.cmo +./_build/lib/re_glob.cma +./_build/lib/re_str.a +./_build/lib/re.cmt +./_build/lib/re_str.o +./_build/lib/re_cset.cmti +./_build/lib/re_posix.annot +./_build/lib/re.ml.depends +./_build/lib/re_posix.cmo +./_build/lib/re.annot +./_build/lib/re_automata.cmx +./_build/lib/re_perl.cma +./_build/lib/re_posix.cmx +./_build/lib/re_perl.annot +./_build/lib/re_glob.cmxa +./_build/lib/re_glob.mli +./_build/lib/re_cset.cmo +./_build/lib/re_cset.cmx +./_build/lib/re_glob.a +./_build/lib/re_cset.cmi +./_build/lib/re_str.mldylib +./_build/lib/re_pcre.cmti +./_build/lib/re_automata.mli.depends +./_build/lib/re.mllib +./_build/lib/re_posix.mllib +./_build/lib/re_emacs.cmxs +./_build/lib/re_fmt.annot +./_build/lib/re_emacs.cmt +./_build/lib/re_fmt.cmo +./_build/lib/re_pcre.ml +./_build/lib/re_glob.cmti +./_build/lib/re_glob.cmx +./_build/lib/re_posix.mli.depends +./_build/lib/re_emacs.cmti +./_build/lib/re_str.ml.depends +./_build/lib/re_automata.cmi +./_build/lib/re_emacs.mldylib +./_build/lib/re_pcre.cma +./_build/lib/re_glob.cmt +./_build/lib/re_pcre.ml.depends +./_build/lib/re_posix.cmxs +./_build/lib/re_automata.annot +./_build/lib/re_emacs.a +./_build/lib/re_posix.cmxa +./_build/lib/re_pcre.mldylib +./_build/lib/re_cset.mli +./_build/lib/re_str.cmxs +./_build/lib/re_posix.o +./_build/lib/re_emacs.mllib +./_build/lib/re_perl.cmo +./_build/lib/re_cset.mli.depends +./_build/lib/re_str.mli +./_build/lib/re_glob.ml.depends +./_build/lib/re_posix.cmt +./_build/lib/re_perl.o +./_build/lib/re_str.cma +./_build/lib/re.cma +./_build/lib/re_emacs.cmx +./_build/lib/re_fmt.cmx +./_build/lib/re_str.cmo +./_build/lib/re_glob.annot +./_build/lib/re_perl.cmti +./_build/lib/re_perl.ml +./_build/lib/re_perl.cmxs +./_build/lib/re.cmo +./_build/lib/re_fmt.ml.depends +./_build/lib/re_posix.ml.depends +./_build/lib/re_automata.ml.depends +./_build/lib/re_fmt.ml +./_build/lib/re_perl.mllib +./_build/lib/re_glob.mli.depends +./_build/lib/re_emacs.cmi +./_build/lib/re.cmx 
+./_build/lib/re_emacs.ml.depends +./_build/lib/re_automata.o +./_build/lib/re_posix.ml +./_build/lib/re_perl.cmxa +./_build/lib/re_str.ml +./_build/lib/re_cset.o +./_build/lib/re_glob.cmo +./_build/lib/re_str.mllib +./_build/lib/re_emacs.cma +./_build/lib/re_automata.cmt +./_build/lib/re_automata.cmti +./_build/oUnit-test_perl-rgcaml#01.log +./_build/lib_test +./_build/lib_test/test_glob.ml.depends +./_build/lib_test/test_emacs.cmo +./_build/lib_test/test_glob.cmt +./_build/lib_test/test_easy.ml +./_build/lib_test/test_str.cmi +./_build/lib_test/re_match.native +./_build/lib_test/test_emacs.ml +./_build/lib_test/test_glob.cmo +./_build/lib_test/test_pcre.cmt +./_build/lib_test/re_match.ml +./_build/lib_test/re_match.cmo +./_build/lib_test/test_pcre.cmi +./_build/lib_test/fort_unit.cmxa +./_build/lib_test/fort_unit.cmo +./_build/lib_test/re_match.o +./_build/lib_test/test_pcre.cmx +./_build/lib_test/re_match.ml.depends +./_build/lib_test/fort_unit.cmt +./_build/lib_test/test_glob.ml +./_build/lib_test/test_str.o +./_build/lib_test/test_perl.native +./_build/lib_test/re_match.cmx +./_build/lib_test/fort_unit.mllib +./_build/lib_test/test_perl.ml +./_build/lib_test/test_perl.annot +./_build/lib_test/fort_unit.ml +./_build/lib_test/test_re.native +./_build/lib_test/test_perl.cmt +./_build/lib_test/test_emacs.o +./_build/lib_test/test_easy.cmi +./_build/lib_test/test_pcre.o +./_build/lib_test/test_emacs.ml.depends +./_build/lib_test/test_emacs.cmx +./_build/lib_test/test_str.cmo +./_build/lib_test/test_perl.cmx +./_build/lib_test/fort_unit.mldylib +./_build/lib_test/re_match.cmt +./_build/lib_test/test_pcre.cmo +./_build/lib_test/test_easy.cmo +./_build/lib_test/fort_unit.cma +./_build/lib_test/test_str.cmx +./_build/lib_test/test_easy.cmx +./_build/lib_test/test_pcre.native +./_build/lib_test/test_emacs.cmt +./_build/lib_test/fort_unit.o +./_build/lib_test/fort_unit.cmx +./_build/lib_test/test_re.cmo +./_build/lib_test/test_glob.cmi +./_build/lib_test/fort_unit.a 
+./_build/lib_test/test_easy.annot +./_build/lib_test/re_match.annot +./_build/lib_test/test_emacs.native +./_build/lib_test/test_emacs.annot +./_build/lib_test/test_re.cmx +./_build/lib_test/fort_unit.annot +./_build/lib_test/test_re.cmi +./_build/lib_test/test_emacs.cmi +./_build/lib_test/test_perl.cmo +./_build/lib_test/test_re.annot +./_build/lib_test/test_easy.native +./_build/lib_test/test_pcre.annot +./_build/lib_test/test_glob.native +./_build/lib_test/test_glob.cmx +./_build/lib_test/test_str.cmt +./_build/lib_test/test_re.cmt +./_build/lib_test/test_re.ml.depends +./_build/lib_test/test_pcre.ml +./_build/lib_test/test_glob.o +./_build/lib_test/fort_unit.ml.depends +./_build/lib_test/test_easy.o +./_build/lib_test/test_perl.cmi +./_build/lib_test/fort_unit.cmi +./_build/lib_test/test_pcre.ml.depends +./_build/lib_test/test_perl.ml.depends +./_build/lib_test/re_match.cmi +./_build/lib_test/test_str.ml.depends +./_build/lib_test/fort_unit.cmxs +./_build/lib_test/test_re.o +./_build/lib_test/test_str.annot +./_build/lib_test/test_easy.ml.depends +./_build/lib_test/test_str.ml +./_build/lib_test/test_str.native +./_build/lib_test/test_re.ml +./_build/lib_test/test_glob.annot +./_build/lib_test/test_easy.cmt +./_build/lib_test/test_perl.o +./_build/_digests +./_build/oUnit-test_emacs-rgcaml#02.log +./_build/benchmarks +./_build/benchmarks/benchmark.cmt +./_build/benchmarks/benchmark.cmx +./_build/benchmarks/benchmark.ml +./_build/benchmarks/benchmark.cmi +./_build/benchmarks/benchmark.cmo +./_build/benchmarks/benchmark.ml.depends +./_build/benchmarks/benchmark.native +./_build/benchmarks/benchmark.annot +./_build/benchmarks/benchmark.o +./_build/myocamlbuild.cmi +./_build/oUnit-test_re.cache +./_build/oUnit-test_re-rgcaml#02.log +./_build/myocamlbuild.cmx +./_build/_log +./_build/ocamlc.where +./_build/oUnit-test_emacs-rgcaml#00.log +./_build/oUnit-test_emacs-rgcaml#01.log +./_build/oUnit-test_perl-rgcaml#00.log +./_build/oUnit-test_perl.cache 
+./_build/oUnit-test_str-rgcaml#01.log +./_build/oUnit-test_str-rgcaml#00.log +./_build/oUnit-test_re-rgcaml#00.log +./_build/oUnit-anon.cache +./_build/myocamlbuild.o +./_build/oUnit-test_emacs.cache +./_build/oUnit-test_str-rgcaml#02.log +./_build/myocamlbuild +./_build/oUnit-test_re-rgcaml#01.log +./_build/myocamlbuild.ml +./setup.log +./.travis.yml +./_tags +./test_pcre.native +./LICENSE +./test_emacs.native +./README.md +./TODO.txt +./test_easy.native +./benchmark.native +./test_glob.native +./tags +./dev.org +./INSTALL +./configure +./Makefile +./CHANGES +./opam +./test_str.native +./myocamlbuild.ml +./compare.pl diff --git a/benchmarks/unicode/http-requests.txt b/benchmarks/unicode/http-requests.txt new file mode 100644 index 00000000..f017911a --- /dev/null +++ b/benchmarks/unicode/http-requests.txt @@ -0,0 +1,494 @@ +GET / HTTP/1.1 +Host: www.reddit.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive + +GET /reddit.v_EZwRzV-Ns.css HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/css,*/*;q=0.1 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /reddit-init.en-us.O1zuMqOOQvY.js HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /reddit.en-us.31yAfSoTsfo.js HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, 
deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /kill.png HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /icon.png HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive + +GET /favicon.ico HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive + +GET /AMZM4CWd6zstSC8y.jpg HTTP/1.1 +Host: b.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /jz1d5Nm0w97-YyNm.jpg HTTP/1.1 +Host: b.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /aWGO99I6yOcNUKXB.jpg HTTP/1.1 +Host: a.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: 
http://www.reddit.com/ + +GET /rZ_rD5TjrJM0E9Aj.css HTTP/1.1 +Host: e.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/css,*/*;q=0.1 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /tmsPwagFzyTvrGRx.jpg HTTP/1.1 +Host: a.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /KYgUaLvXCK3TCEJx.jpg HTTP/1.1 +Host: a.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /81pzxT5x2ozuEaxX.jpg HTTP/1.1 +Host: e.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /MFqCUiUVPO5V8t6x.jpg HTTP/1.1 +Host: a.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /TFpYTiAO5aEowokv.jpg HTTP/1.1 +Host: e.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ 
+ +GET /eMWMpmm9APNeNqcF.jpg HTTP/1.1 +Host: e.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /S-IpsJrOKuaK9GZ8.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /3V6dj9PDsNnheDXn.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /wQ3-VmNXhv8sg4SJ.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /ixd1C1njpczEWC22.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /nGsQj15VyOHMwmq8.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET 
/zT4yQmDxQLbIxK1b.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /L5e1HcZLv1iu4nrG.jpg HTTP/1.1 +Host: f.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /WJFFPxD8X4JO_lIG.jpg HTTP/1.1 +Host: f.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /hVMVTDdjuY3bQox5.jpg HTTP/1.1 +Host: f.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /rnWf8CjBcyPQs5y_.jpg HTTP/1.1 +Host: f.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /gZJL1jNylKbGV4d-.jpg HTTP/1.1 +Host: d.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET 
/aNd2zNRLXiMnKUFh.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /droparrowgray.gif HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.redditstatic.com/reddit.v_EZwRzV-Ns.css + +GET /sprite-reddit.an0Lnf61Ap4.png HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.redditstatic.com/reddit.v_EZwRzV-Ns.css + +GET /ga.js HTTP/1.1 +Host: www.google-analytics.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ +If-Modified-Since: Tue, 29 Oct 2013 19:33:51 GMT + +GET /reddit/ads.html?sr=-reddit.com&bust2 HTTP/1.1 +Host: static.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /pixel/of_destiny.png?v=hOlmDALJCWWdjzfBV4ZxJPmrdCLWB%2Ftq7Z%2Ffp4Q%2FxXbVPPREuMJMVGzKraTuhhNWxCCwi6yFEZg%3D&r=783333388 HTTP/1.1 +Host: pixel.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 
Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /UNcO-h_QcS9PD-Gn.jpg HTTP/1.1 +Host: c.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://e.thumbs.redditmedia.com/rZ_rD5TjrJM0E9Aj.css + +GET /welcome-lines.png HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.redditstatic.com/reddit.v_EZwRzV-Ns.css + +GET /welcome-upvote.png HTTP/1.1 +Host: www.redditstatic.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.redditstatic.com/reddit.v_EZwRzV-Ns.css + +GET /__utm.gif?utmwv=5.5.1&utms=1&utmn=720496082&utmhn=www.reddit.com&utme=8(site*srpath*usertype*uitype)9(%20reddit.com*%20reddit.com-GET_listing*guest*web)11(3!2)&utmcs=UTF-8&utmsr=2560x1600&utmvp=1288x792&utmsc=24-bit&utmul=en-us&utmje=1&utmfl=13.0%20r0&utmdt=reddit%3A%20the%20front%20page%20of%20the%20internet&utmhid=2129416330&utmr=-&utmp=%2F&utmht=1400862512705&utmac=UA-12131688-1&utmcc=__utma%3D55650728.585571751.1400862513.1400862513.1400862513.1%3B%2B__utmz%3D55650728.1400862513.1.1.utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none)%3B&utmu=qR~ HTTP/1.1 +Host: www.google-analytics.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 
+Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /ImnpOQhbXUPkwceN.png HTTP/1.1 +Host: a.thumbs.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /ajax/libs/jquery/1.7.1/jquery.min.js HTTP/1.1 +Host: ajax.googleapis.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /__utm.gif?utmwv=5.5.1&utms=2&utmn=1493472678&utmhn=www.reddit.com&utmt=event&utme=5(AdBlock*enabled*false)(0)8(site*srpath*usertype*uitype)9(%20reddit.com*%20reddit.com-GET_listing*guest*web)11(3!2)&utmcs=UTF-8&utmsr=2560x1600&utmvp=1288x792&utmsc=24-bit&utmul=en-us&utmje=1&utmfl=13.0%20r0&utmdt=reddit%3A%20the%20front%20page%20of%20the%20internet&utmhid=2129416330&utmr=-&utmp=%2F&utmht=1400862512708&utmac=UA-12131688-1&utmni=1&utmcc=__utma%3D55650728.585571751.1400862513.1400862513.1400862513.1%3B%2B__utmz%3D55650728.1400862513.1.1.utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none)%3B&utmu=6R~ HTTP/1.1 +Host: www.google-analytics.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /ados.js?q=43 HTTP/1.1 +Host: secure.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: 
http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /fetch-trackers?callback=jQuery111005268222517967478_1400862512407&ids%5B%5D=t3_25jzeq-t8_k2ii&_=1400862512408 HTTP/1.1 +Host: tracker.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /ados?t=1400862512892&request={%22Placements%22:[{%22A%22:5146,%22S%22:24950,%22D%22:%22main%22,%22AT%22:5},{%22A%22:5146,%22S%22:24950,%22D%22:%22sponsorship%22,%22AT%22:8}],%22Keywords%22:%22-reddit.com%22,%22Referrer%22:%22http%3A%2F%2Fwww.reddit.com%2F%22,%22IsAsync%22:true,%22WriteResults%22:true} HTTP/1.1 +Host: engine.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /pixel/of_doom.png?id=t3_25jzeq-t8_k2ii&hash=da31d967485cdbd459ce1e9a5dde279fef7fc381&r=1738649500 HTTP/1.1 +Host: pixel.redditmedia.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /Extensions/adFeedback.js HTTP/1.1 +Host: static.adzrk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: */* +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /Extensions/adFeedback.css HTTP/1.1 +Host: static.adzrk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: 
text/css,*/*;q=0.1 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /reddit/ads-load.html?bust2 HTTP/1.1 +Host: static.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://www.reddit.com/ + +GET /Advertisers/a774d7d6148046efa89403a8db635a81.jpg HTTP/1.1 +Host: static.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /i.gif?e=eyJhdiI6NjIzNTcsImF0Ijo1LCJjbSI6MTE2MzUxLCJjaCI6Nzk4NCwiY3IiOjMzNzAxNSwiZGkiOiI4NmI2Y2UzYWM5NDM0MjhkOTk2ZTg4MjYwZDE5ZTE1YyIsImRtIjoxLCJmYyI6NDE2MTI4LCJmbCI6MjEwNDY0LCJrdyI6Ii1yZWRkaXQuY29tIiwibWsiOiItcmVkZGl0LmNvbSIsIm53Ijo1MTQ2LCJwYyI6MCwicHIiOjIwMzYyLCJydCI6MSwicmYiOiJodHRwOi8vd3d3LnJlZGRpdC5jb20vIiwic3QiOjI0OTUwLCJ1ayI6InVlMS01ZWIwOGFlZWQ5YTc0MDFjOTE5NWNiOTMzZWI3Yzk2NiIsInRzIjoxNDAwODYyNTkzNjQ1fQ&s=lwlbFf2Uywt7zVBFRj_qXXu7msY HTTP/1.1 +Host: engine.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 +Cookie: azk=ue1-5eb08aeed9a7401c9195cb933eb7c966 + +GET /BurstingPipe/adServer.bs?cn=tf&c=19&mc=imp&pli=9994987&PluID=0&ord=1400862593644&rtu=-1 HTTP/1.1 +Host: bs.serving-sys.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 
+Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads.html?sr=-reddit.com&bust2 + +GET /Advertisers/63cfd0044ffd49c0a71a6626f7a1d8f0.jpg HTTP/1.1 +Host: static.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads-load.html?bust2 + +GET /BurstingPipe/adServer.bs?cn=tf&c=19&mc=imp&pli=9962555&PluID=0&ord=1400862593645&rtu=-1 HTTP/1.1 +Host: bs.serving-sys.com +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads-load.html?bust2 +Cookie: S_9994987=6754579095859875029; A4=01fmFvgRnI09SF00000; u2=d1263d39-874b-4a89-86cd-a2ab0860ed4e3Zl040 + +GET /i.gif?e=eyJhdiI6NjIzNTcsImF0Ijo4LCJjbSI6MTE2MzUxLCJjaCI6Nzk4NCwiY3IiOjMzNzAxOCwiZGkiOiI3OTdlZjU3OWQ5NjE0ODdiODYyMGMyMGJkOTE4YzNiMSIsImRtIjoxLCJmYyI6NDE2MTMxLCJmbCI6MjEwNDY0LCJrdyI6Ii1yZWRkaXQuY29tIiwibWsiOiItcmVkZGl0LmNvbSIsIm53Ijo1MTQ2LCJwYyI6MCwicHIiOjIwMzYyLCJydCI6MSwicmYiOiJodHRwOi8vd3d3LnJlZGRpdC5jb20vIiwic3QiOjI0OTUwLCJ1ayI6InVlMS01ZWIwOGFlZWQ5YTc0MDFjOTE5NWNiOTMzZWI3Yzk2NiIsInRzIjoxNDAwODYyNTkzNjQ2fQ&s=OjzxzXAgQksbdQOHNm-bjZcnZPA HTTP/1.1 +Host: engine.adzerk.net +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Accept: image/png,image/*;q=0.8,*/*;q=0.5 +Accept-Language: en-us,en;q=0.5 +Accept-Encoding: gzip, deflate +Connection: keep-alive +Referer: http://static.adzerk.net/reddit/ads-load.html?bust2 +Cookie: azk=ue1-5eb08aeed9a7401c9195cb933eb7c966 + +GET 
/subscribe?host_int=1042356184&ns_map=571794054_374233948806,464381511_13349283399&user_id=245722467&nid=1399334269710011966&ts=1400862514 HTTP/1.1 +Host: notify8.dropbox.com +Accept-Encoding: identity +Connection: keep-alive +X-Dropbox-Locale: en_US +User-Agent: DropboxDesktopClient/2.7.54 (Macintosh; 10.8; ('i32',); en_US) + diff --git a/benchmarks/unicode/http.ml b/benchmarks/unicode/http.ml new file mode 100644 index 00000000..653521fd --- /dev/null +++ b/benchmarks/unicode/http.ml @@ -0,0 +1,41 @@ +open Re_unicode.Utf8.Re + +let space = rep blank +let crlf = str "\r\n" (* ok if Utf8 *) +let token = rep1 @@ compl [ rg (char '\000') (char '\031'); set "\u{007f})(<>@,;:\\/[]?={}" ] +let meth = token + +let version = + let digits = rep1 digit in + let decimal = seq [ digits; opt (seq [ char '.' |> letter; digits ]) ] in + seq [ str "HTTP/"; decimal ] +;; + +let uri = rep1 (compl [ char '\n' |> letter ]) +let request_line = [ space; group meth; space; group uri; group version; space ] |> seq + +let header = + let key = group (rep1 (compl [ char ':' |> letter ])) in + let value = group (rep1 (compl [ char '\n' |> letter])) in + seq [ space; key; space; char ':' |> letter; space; value; space; crlf ] +;; + +let request' = seq [ request_line; crlf; rep header; crlf ] + +module Export = struct + let request = request' + let request_g = request' |> no_group + let requests = request' |> rep1 + let requests_g = request' |> no_group |> rep1 +end + +let requests = + Stdio.In_channel.read_all "benchmarks/unicode/http-requests.txt" + +let rec read_all pos re reqs = + if pos < String.length reqs + then ( + let g = exec ~pos re reqs in + let _, pos = Group.offset g 0 in + read_all (pos + 1) re reqs) +;; diff --git a/benchmarks/unicode/memory.ml b/benchmarks/unicode/memory.ml new file mode 100644 index 00000000..73fbaa70 --- /dev/null +++ b/benchmarks/unicode/memory.ml @@ -0,0 +1,32 @@ +open Core +(* This set of benchmarks is designed for testing re's memory usage rather than + 
speed. *) + +module Bench = Core_bench.Bench + + +let size = 1_000 + +(* a pathological re that will consume a bunch of memory *) +let re () = + let open Re_unicode.Utf8.Re in + compile @@ seq [ rep (set "01"); char '1' |> letter; repn (set "01") size (Some size) ] +;; + +(* Another pathological case that is a simplified version of the above *) +let re2 () = + let open Re_unicode.Utf8.Re in + seq [ rep (set "01"); char '1' |> letter; repn (set "01") size (Some size); char 'x' |> letter ] |> compile +;; + +let str = "01" ^ String.make size '1' + +let benchmarks = + [ "memory 1", re; "memory 2", re2 ] + |> ListLabels.map ~f:(fun (name, re) -> + Bench.Test.create_indexed ~name ~args:[ 10; 20; 40; 80; 100; size ] (fun len -> + Staged.stage (fun () -> + let re = re () in + let len = Int.min (String.length str) len in + ignore (Re_unicode.Utf8.Re.execp ~pos:0 ~len re str)))) +;; diff --git a/benchmarks/unicode/memory.mli b/benchmarks/unicode/memory.mli new file mode 100644 index 00000000..cf68f44a --- /dev/null +++ b/benchmarks/unicode/memory.mli @@ -0,0 +1 @@ +val benchmarks : Core_bench.Bench.Test.t list diff --git a/benchmarks/unicode/tex.gitignore b/benchmarks/unicode/tex.gitignore new file mode 100644 index 00000000..dcb47106 --- /dev/null +++ b/benchmarks/unicode/tex.gitignore @@ -0,0 +1,194 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 + +## Intermediate documents: +*.dvi +*-converted-to.* +# these rules might exclude image files for figures etc. 
+# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.brf +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.snm +*.vrb + +# cprotect +*.cpt + +# fixme +*.lox + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.gtex + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Comment the next line if you want to keep your tikz graphics files +*.tikz +*-tikzDictionary + +# listings +*.lol + +# makeidx +*.idx +*.ilg +*.ind +*.ist + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc +*.mtc[0-9] +*.mtc[1-9][0-9] + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# mylatexformat +*.fmt + +# nomencl +*.nlo + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# easy-todo +*.lod + +# xindy +*.xdy + +# xypic precompiled matrices +*.xyc + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# Kile +*.backup + +# KBibTeX +*~[0-9]* \ No newline at end of file diff --git a/benchmarks/unicode/tex.ml b/benchmarks/unicode/tex.ml new file mode 100644 index 
00000000..564d642a --- /dev/null +++ b/benchmarks/unicode/tex.ml @@ -0,0 +1,18 @@ +open Core +module Re = Re_unicode.Utf8.Re + +let ignore_re = + Stdio.In_channel.read_lines "benchmarks/unicode/tex.gitignore" + |> List.map ~f:(fun s -> + match Base.String.lsplit2 s ~on:'#' with + | Some (pattern, _comment) -> pattern + | None -> s) + |> List.filter_map ~f:(fun s -> + match Base.String.strip s with + | "" -> None + | s -> Some s) + |> List.map ~f:Re.Glob.glob + |> Re.alt +;; + +let ignore_filesnames = Stdio.In_channel.read_lines "benchmarks/unicode/files" diff --git a/doc/CaseFolding.csv b/doc/CaseFolding.csv new file mode 100644 index 00000000..33944d83 --- /dev/null +++ b/doc/CaseFolding.csv @@ -0,0 +1,1592 @@ +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER 
A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN 
CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S 
WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN 
CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL 
LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON 
+022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE +023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE +0244; C; 0289; # LATIN CAPITAL LETTER U BAR +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE +0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0370; C; 0371; # GREEK CAPITAL LETTER HETA +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F; C; 03F3; # GREEK CAPITAL LETTER YOT +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 
03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN +03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL +03FE; C; 037C; # GREEK CAPITAL DOTTED 
LUNATE SIGMA SYMBOL +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL 
LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER 
+04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; 
# CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE +052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE +052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN 
+0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON +10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN +10A6; C; 2D06; # 
GEORGIAN CAPITAL LETTER ZEN +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS +10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN +10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL 
LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK +1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE +1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN +1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN +1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN +1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON +1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN +1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN +1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN +1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN +1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN +1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN +1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS +1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN +1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR +1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON +1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR +1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR +1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE +1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN +1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR +1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN +1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR +1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR +1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN +1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR +1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN +1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN +1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN +1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL +1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL +1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR +1CAE; C; 10EE; # 
GEORGIAN MTAVRULI CAPITAL LETTER XAN +1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN +1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE +1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE +1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE +1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE +1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR +1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE +1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI +1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN +1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI +1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN +1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN +1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN +1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 
1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE 
+1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN 
CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH 
HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL 
LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND 
PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL 
LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 
03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER 
OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD3; S; 0390; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH 
PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2132; C; 214E; # TURNED CAPITAL F +2160; C; 2170; # ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN 
NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU +2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE 
+2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE +2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI +2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS +2C2A; C; 
2C5A; # GLAGOLITIC CAPITAL LETTER FITA +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO +2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA +2CA6; C; 2CA7; # COPTIC CAPITAL 
LETTER TAU +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA +A648; C; A649; # CYRILLIC 
CAPITAL LETTER DJERV +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A +A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O +A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE +A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O +A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726; C; A727; # LATIN CAPITAL LETTER HENG +A728; C; A729; # LATIN CAPITAL LETTER TZ +A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO +A72E; C; 
A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732; C; A733; # LATIN CAPITAL LETTER AA +A734; C; A735; # LATIN CAPITAL LETTER AO +A736; C; A737; # LATIN CAPITAL LETTER AU +A738; C; A739; # LATIN CAPITAL LETTER AV +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C; C; A73D; # LATIN CAPITAL LETTER AY +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP +A74E; C; A74F; # LATIN CAPITAL LETTER OO +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760; C; A761; # LATIN CAPITAL LETTER VY +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768; C; A769; # LATIN CAPITAL LETTER VEND +A76A; C; A76B; # LATIN CAPITAL LETTER ET +A76C; C; A76D; # LATIN CAPITAL LETTER IS +A76E; C; A76F; # LATIN CAPITAL LETTER CON +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G +A780; C; A781; # LATIN CAPITAL LETTER TURNED L +A782; C; A783; # LATIN CAPITAL 
LETTER INSULAR R +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR +A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH +A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE +A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE +A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE +A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK +A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E +A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G +A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K +A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE +A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A +A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I +A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U +A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O +A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W +A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK +A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK +A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK +A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY +A7C9; C; A7CA; # LATIN CAPITAL 
LETTER S WITH SHORT STROKE OVERLAY +A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN +A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE +A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S +A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S +A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA +A7DC; C; 019B; # LATIN CAPITAL LETTER LAMBDA WITH STROKE +A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE +AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL 
LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU +ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN 
SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z +10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET 
CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H +10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG +10426; C; 1044E; # DESERET CAPITAL LETTER OI +10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE 
CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10570; C; 10597; # VITHKUQI CAPITAL LETTER A +10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE +10572; C; 10599; # VITHKUQI CAPITAL LETTER BE +10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE +10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE +10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE +10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE +10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI +10578; C; 1059F; # VITHKUQI CAPITAL LETTER E +10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE +1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA +1057C; C; 105A3; # VITHKUQI 
CAPITAL LETTER HA +1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA +1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I +1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE +10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE +10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA +10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA +10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA +10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME +10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE +10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE +10587; C; 105AE; # VITHKUQI CAPITAL LETTER O +10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE +10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA +1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE +1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE +1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE +1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE +1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE +10590; C; 105B7; # VITHKUQI CAPITAL LETTER U +10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE +10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE +10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y +10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA +10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; 
C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +10D50; C; 10D70; # GARAY CAPITAL LETTER A +10D51; C; 10D71; # GARAY CAPITAL LETTER CA +10D52; C; 10D72; # GARAY CAPITAL LETTER MA +10D53; C; 10D73; # GARAY CAPITAL 
LETTER KA +10D54; C; 10D74; # GARAY CAPITAL LETTER BA +10D55; C; 10D75; # GARAY CAPITAL LETTER JA +10D56; C; 10D76; # GARAY CAPITAL LETTER SA +10D57; C; 10D77; # GARAY CAPITAL LETTER WA +10D58; C; 10D78; # GARAY CAPITAL LETTER LA +10D59; C; 10D79; # GARAY CAPITAL LETTER GA +10D5A; C; 10D7A; # GARAY CAPITAL LETTER DA +10D5B; C; 10D7B; # GARAY CAPITAL LETTER XA +10D5C; C; 10D7C; # GARAY CAPITAL LETTER YA +10D5D; C; 10D7D; # GARAY CAPITAL LETTER TA +10D5E; C; 10D7E; # GARAY CAPITAL LETTER RA +10D5F; C; 10D7F; # GARAY CAPITAL LETTER NYA +10D60; C; 10D80; # GARAY CAPITAL LETTER FA +10D61; C; 10D81; # GARAY CAPITAL LETTER NA +10D62; C; 10D82; # GARAY CAPITAL LETTER PA +10D63; C; 10D83; # GARAY CAPITAL LETTER HA +10D64; C; 10D84; # GARAY CAPITAL LETTER OLD KA +10D65; C; 10D85; # GARAY CAPITAL LETTER OLD NA +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT +118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI 
CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY +16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL 
LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +# +# EOF diff --git a/doc/CaseFolding.txt b/doc/CaseFolding.txt new file mode 100644 index 00000000..1b7a9c15 --- /dev/null +++ b/doc/CaseFolding.txt @@ -0,0 +1,1654 @@ +# CaseFolding-16.0.0.txt +# Date: 2024-04-30, 21:48:11 GMT +# © 2024 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
+# For terms of use and license, see https://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ +# +# Case Folding Properties +# +# This file is a supplement to the UnicodeData file. +# It provides a case folding mapping generated from the Unicode Character Database. +# If all characters are mapped according to the full mapping below, then +# case differences (according to UnicodeData.txt and SpecialCasing.txt) +# are eliminated. +# +# The data supports both implementations that require simple case foldings +# (where string lengths don't change), and implementations that allow full case folding +# (where string lengths may grow). Note that where they can be supported, the +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# +# All code points not listed in this file map to themselves. +# +# NOTE: case folding does not preserve normalization formats! +# +# For information on case folding, including how to have case folding +# preserve normalization formats, see Section 3.13 Default Case Algorithms in +# The Unicode Standard. +# +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# ; ; ; # +# +# The status field is: +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 
+# Note that the Turkic mappings do not maintain canonical equivalence without additional processing. +# See the discussions of case mapping in the Unicode Standard for more information. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) +# +# ================================================================= + +# Property: Case_Folding + +# All code points not explicitly listed for Case_Folding +# have the value C for the status field, and the code point itself for the mapping field. + +# ================================================================= +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE 
+00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER 
D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; 
C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER 
TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON 
+01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH 
DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE +023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE +0244; C; 0289; # LATIN CAPITAL LETTER U BAR +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE +0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0370; C; 0371; # GREEK CAPITAL LETTER HETA +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F; C; 03F3; # GREEK CAPITAL LETTER YOT +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # 
GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN +03FD; C; 037B; # GREEK 
CAPITAL REVERSED LUNATE SIGMA SYMBOL +03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; 
C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER 
BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH 
DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE +052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE +052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531; C; 0561; 
# ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON +10A4; C; 2D04; # GEORGIAN CAPITAL 
LETTER EN +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN +10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS +10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN +10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 
043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK +1C89; C; 1C8A; # CYRILLIC CAPITAL LETTER TJE +1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN +1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN +1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN +1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON +1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN +1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN +1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN +1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN +1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN +1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN +1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS +1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN +1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR +1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON +1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR +1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR +1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE +1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN +1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR +1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN +1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR +1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR +1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN +1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR +1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN +1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN +1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN +1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL +1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL +1CAD; C; 
10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR +1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN +1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN +1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE +1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE +1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE +1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE +1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR +1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE +1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI +1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN +1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI +1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN +1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN +1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN +1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE 
+1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE 
+1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN 
CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL 
LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK 
CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH 
DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH 
PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK 
CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH 
PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD3; S; 0390; # GREEK SMALL LETTER 
IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE3; S; 03B0; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2132; C; 214E; # TURNED CAPITAL F +2160; C; 2170; # 
ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU +2C01; C; 2C31; # 
GLAGOLITIC CAPITAL LETTER BUKY +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE +2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE +2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI +2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER 
BIG YUS +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS +2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO 
+2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA +2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED 
DZE +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA +A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A +A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O +A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE +A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O +A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726; C; A727; # LATIN CAPITAL LETTER HENG +A728; C; A729; # LATIN CAPITAL LETTER TZ +A72A; C; A72B; # LATIN CAPITAL 
LETTER TRESILLO +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO +A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732; C; A733; # LATIN CAPITAL LETTER AA +A734; C; A735; # LATIN CAPITAL LETTER AO +A736; C; A737; # LATIN CAPITAL LETTER AU +A738; C; A739; # LATIN CAPITAL LETTER AV +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C; C; A73D; # LATIN CAPITAL LETTER AY +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP +A74E; C; A74F; # LATIN CAPITAL LETTER OO +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760; C; A761; # LATIN CAPITAL LETTER VY +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768; C; A769; # LATIN CAPITAL LETTER VEND +A76A; C; A76B; # LATIN CAPITAL LETTER ET +A76C; C; A76D; # LATIN CAPITAL LETTER IS +A76E; C; A76F; # LATIN CAPITAL LETTER CON +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G +A780; C; 
A781; # LATIN CAPITAL LETTER TURNED L +A782; C; A783; # LATIN CAPITAL LETTER INSULAR R +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR +A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH +A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE +A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE +A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE +A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK +A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E +A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G +A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K +A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE +A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A +A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I +A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U +A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O +A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W +A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK +A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK +A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK +A7C7; C; A7C8; # LATIN 
CAPITAL LETTER D WITH SHORT STROKE OVERLAY +A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY +A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN +A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE +A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S +A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S +A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA +A7DC; C; 019B; # LATIN CAPITAL LETTER LAMBDA WITH STROKE +A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE 
+AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU +ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA 
+FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB05; S; FB06; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 
+10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H +10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG +10426; C; 1044E; # DESERET CAPITAL LETTER OI +10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER 
AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10570; C; 10597; # VITHKUQI CAPITAL LETTER A +10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE +10572; C; 10599; # VITHKUQI CAPITAL LETTER BE +10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE +10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE +10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE +10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE +10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI +10578; C; 1059F; # VITHKUQI CAPITAL LETTER E +10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE 
+1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA +1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA +1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA +1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I +1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE +10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE +10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA +10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA +10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA +10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME +10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE +10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE +10587; C; 105AE; # VITHKUQI CAPITAL LETTER O +10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE +10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA +1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE +1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE +1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE +1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE +1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE +10590; C; 105B7; # VITHKUQI CAPITAL LETTER U +10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE +10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE +10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y +10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA +10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN 
CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +10D50; C; 10D70; # GARAY CAPITAL LETTER A +10D51; C; 10D71; # GARAY CAPITAL LETTER CA 
+10D52; C; 10D72; # GARAY CAPITAL LETTER MA +10D53; C; 10D73; # GARAY CAPITAL LETTER KA +10D54; C; 10D74; # GARAY CAPITAL LETTER BA +10D55; C; 10D75; # GARAY CAPITAL LETTER JA +10D56; C; 10D76; # GARAY CAPITAL LETTER SA +10D57; C; 10D77; # GARAY CAPITAL LETTER WA +10D58; C; 10D78; # GARAY CAPITAL LETTER LA +10D59; C; 10D79; # GARAY CAPITAL LETTER GA +10D5A; C; 10D7A; # GARAY CAPITAL LETTER DA +10D5B; C; 10D7B; # GARAY CAPITAL LETTER XA +10D5C; C; 10D7C; # GARAY CAPITAL LETTER YA +10D5D; C; 10D7D; # GARAY CAPITAL LETTER TA +10D5E; C; 10D7E; # GARAY CAPITAL LETTER RA +10D5F; C; 10D7F; # GARAY CAPITAL LETTER NYA +10D60; C; 10D80; # GARAY CAPITAL LETTER FA +10D61; C; 10D81; # GARAY CAPITAL LETTER NA +10D62; C; 10D82; # GARAY CAPITAL LETTER PA +10D63; C; 10D83; # GARAY CAPITAL LETTER HA +10D64; C; 10D84; # GARAY CAPITAL LETTER OLD KA +10D65; C; 10D85; # GARAY CAPITAL LETTER OLD NA +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT 
+118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY +16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; 
C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +# +# EOF diff --git a/dune-project b/dune-project index 971ae457..063e8719 100644 --- a/dune-project +++ b/dune-project @@ -22,6 +22,9 @@ (synopsis "RE is a regular expression library for OCaml") (depends (ocaml (>= 4.08.0)) + uucp + (uucd :build) + (zip :build) (ppx_expect :with-test) (ounit2 :with-test) (js_of_ocaml :with-test)) diff --git a/lib/unicode/ast.ml 
b/lib/unicode/ast.ml new file mode 100644 index 00000000..a98a508d --- /dev/null +++ b/lib/unicode/ast.ml @@ -0,0 +1,500 @@ +open Import + +type ('a, _) ast = + | Alternative : 'a list -> ('a, [> `Uncased ]) ast + | No_case : 'a -> ('a, [> `Cased ]) ast + | Case : 'a -> ('a, [> `Cased ]) ast + +let dyn_of_ast f = + let open Dyn in + function + | Alternative xs -> variant "Alternative" (List.map xs ~f) + | No_case a -> variant "No_case" [ f a ] + | Case a -> variant "Case" [ f a ] + +let empty_alternative : ('a, 'b) ast = Alternative [] + +let equal_ast (type a) eq (x : (a, [ `Uncased ]) ast) + (y : (a, [ `Uncased ]) ast) = + match (x, y) with Alternative a, Alternative b -> List.equal ~eq a b + +let pp_ast (type a b) f fmt (ast : (a, b) ast) = + let open Fmt in + let var s re = sexp fmt s f re in + match ast with + | Alternative alt -> sexp fmt "Alternative" (list f) alt + | Case c -> var "Case" c + | No_case c -> var "No_case" c + +type ('a, 'case) gen = + | Set of 'a + | Ast of (('a, 'case) gen, 'case) ast + | Sequence of ('a, 'case) gen list + | Repeat of ('a, 'case) gen * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Group of string option * ('a, 'case) gen + | No_group of ('a, 'case) gen + | Nest of ('a, 'case) gen + | Pmark of Pmark.t * ('a, 'case) gen + | Sem of Automata.Sem.t * ('a, 'case) gen + | Sem_greedy of Automata.Rep_kind.t * ('a, 'case) gen + +let rec dyn_of_gen f = + let open Dyn in + function + | Set a -> variant "Set" [ f a ] + | Ast ast -> variant "Ast" [ dyn_of_ast (dyn_of_gen f) ast ] + | Sequence xs -> variant "Sequence" (List.map xs ~f:(dyn_of_gen f)) + | Repeat (gen, min, max) -> + let base = match max with None -> [] | Some x -> [ int x ] in + variant "Repeat" (dyn_of_gen f gen :: int min :: base) + | Beg_of_line -> enum "Beg_of_line" + | End_of_line -> enum "End_of_line" + | Beg_of_word -> enum "Beg_of_word" + | 
End_of_word -> enum "End_of_word" + | Not_bound -> enum "Not_bound" + | Beg_of_str -> enum "Beg_of_str" + | End_of_str -> enum "End_of_str" + | Last_end_of_line -> enum "Last_end_of_line" + | Start -> enum "Start" + | Stop -> enum "Stop" + | Group (name, t) -> + let args = + let args = [ dyn_of_gen f t ] in + match name with None -> args | Some name -> string name :: args + in + variant "Group" args + | No_group x -> variant "No_group" [ dyn_of_gen f x ] + | Nest x -> variant "Nest" [ dyn_of_gen f x ] + | Pmark (pmark, t) -> variant "Pmark" [ Pmark.to_dyn pmark; dyn_of_gen f t ] + | Sem (sem, t) -> variant "Sem" [ Automata.Sem.to_dyn sem; dyn_of_gen f t ] + | Sem_greedy (rep, t) -> + variant "Sem_greedy" [ Automata.Rep_kind.to_dyn rep; dyn_of_gen f t ] + +let rec pp_gen pp_cset fmt t = + let open Format in + let open Fmt in + let pp = pp_gen pp_cset in + let var s re = sexp fmt s pp re in + let seq s rel = sexp fmt s (list pp) rel in + match t with + | Set cset -> pp_cset fmt cset + | Sequence sq -> seq "Sequence" sq + | Repeat (re, start, stop) -> + let pp' fmt () = fprintf fmt "%a@ %d%a" pp re start optint stop in + sexp fmt "Repeat" pp' () + | Beg_of_line -> str fmt "Beg_of_line" + | End_of_line -> str fmt "End_of_line" + | Beg_of_word -> str fmt "Beg_of_word" + | End_of_word -> str fmt "End_of_word" + | Not_bound -> str fmt "Not_bound" + | Beg_of_str -> str fmt "Beg_of_str" + | End_of_str -> str fmt "End_of_str" + | Last_end_of_line -> str fmt "Last_end_of_line" + | Start -> str fmt "Start" + | Stop -> str fmt "Stop" + | Group (None, c) -> var "Group" c + | Group (Some n, c) -> sexp fmt "Named_group" (pair str pp) (n, c) + | Nest c -> var "Nest" c + | Pmark (m, r) -> sexp fmt "Pmark" (pair Pmark.pp pp) (m, r) + | Ast a -> pp_ast pp fmt a + | Sem (sem, a) -> sexp fmt "Sem" (pair Automata.Sem.pp pp) (sem, a) + | Sem_greedy (k, re) -> + sexp fmt "Sem_greedy" (pair Automata.Rep_kind.pp pp) (k, re) + | No_group c -> var "No_group" c + +let rec equal cset x1 x2 = + 
match (x1, x2) with + | Set s1, Set s2 -> cset s1 s2 + | Sequence l1, Sequence l2 -> List.equal ~eq:(equal cset) l1 l2 + | Repeat (x1', i1, j1), Repeat (x2', i2, j2) -> + Int.equal i1 i2 && Option.equal Int.equal j1 j2 && equal cset x1' x2' + | Beg_of_line, Beg_of_line + | End_of_line, End_of_line + | Beg_of_word, Beg_of_word + | End_of_word, End_of_word + | Not_bound, Not_bound + | Beg_of_str, Beg_of_str + | End_of_str, End_of_str + | Last_end_of_line, Last_end_of_line + | Start, Start + | Stop, Stop -> + true + | Group _, Group _ -> + (* Do not merge groups! *) + false + | Pmark (m1, r1), Pmark (m2, r2) -> Pmark.equal m1 m2 && equal cset r1 r2 + | Nest x, Nest y -> equal cset x y + | Ast x, Ast y -> equal_ast (equal cset) x y + | Sem (sem, a), Sem (sem', a') -> Poly.equal sem sem' && equal cset a a' + | Sem_greedy (rep, a), Sem_greedy (rep', a') -> + Poly.equal rep rep' && equal cset a a' + | _ -> false + +module type Export = sig + type letter + type nonrec t + + val empty : t + val epsilon : t + val str : string -> t + val char : char -> letter + val int : int -> letter + val no_case : t -> t + val case : t -> t + val diff : t -> t -> t + val compl : t list -> t + val repn : t -> int -> int option -> t + val inter : t list -> t + val letter : letter -> t + val any : t + val set : string -> t + val mark : t -> Pmark.t * t + val nest : t -> t + val no_group : t -> t + val whole_string : t -> t + val leol : t + val longest : t -> t + val greedy : t -> t + val non_greedy : t -> t + val stop : t + val not_boundary : t + val group : ?name:string -> t -> t + val word : t -> t + val first : t -> t + val bos : t + val bow : t + val eow : t + val eos : t + val bol : t + val start : t + val eol : t + val opt : t -> t + val rep : t -> t + val rep1 : t -> t + val alt : t list -> t + val shortest : t -> t + val seq : t list -> t + val pp : t Fmt.t + val witness : t -> string +end + +module type T = sig + type letter + type cset_t + type color_map_t + + type cset = + | Cset 
of cset_t + | Intersection of cset list + | Complement of cset list + | Difference of cset * cset + | Cast of (cset, [ `Cased | `Uncased ]) ast + + type t = (cset, [ `Cased | `Uncased ]) gen + type no_case = (cset_t, [ `Uncased ]) gen + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t + + val merge_sequences : + (cset_t, [ `Uncased ]) gen list -> (cset_t, [ `Uncased ]) gen list + + val handle_case : bool -> t -> (cset_t, [ `Uncased ]) gen + val anchored : t -> bool + val colorize : color_map_t -> (cset_t, [ `Uncased ]) gen -> bool + val cset : cset_t -> t + val t_of_cset : cset -> t + + module Export : Export with type t = t and type letter = letter +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + type letter = Cset.letter + type cset_t = Cset.t + type color_map_t = Color_map.t + + type cset = + | Cset of cset_t + | Intersection of cset list + | Complement of cset list + | Difference of cset * cset + | Cast of (cset, [ `Cased | `Uncased ]) ast + + let rec dyn_of_cset = + let open Dyn in + function + | Cset cset -> variant "Cset" [ Cset.to_dyn cset ] + | Intersection xs -> variant "Intersection" (List.map xs ~f:dyn_of_cset) + | Complement xs -> variant "Complement" (List.map xs ~f:dyn_of_cset) + | Difference (x, y) -> variant "Difference" [ dyn_of_cset x; dyn_of_cset y ] + | Cast c -> variant "Cast" [ dyn_of_ast dyn_of_cset c ] + + let rec pp_cset fmt cset = + let open Fmt in + let seq s rel = sexp fmt s (list pp_cset) rel in + match cset with + | Cast s -> pp_ast pp_cset fmt s + | Cset s -> sexp fmt "Set" Cset.pp s + | Intersection c -> seq "Intersection" c + | Complement c -> seq "Complement" c + | Difference (a, b) -> sexp fmt "Difference" (pair pp_cset pp_cset) (a, b) + + type t = (cset, [ `Cased | `Uncased ]) gen + type no_case = (cset_t, [ `Uncased ]) gen + + let to_dyn = dyn_of_gen dyn_of_cset + let pp = pp_gen pp_cset + + (* let cset cset = Set 
(Cset cset) *) + let cset cset = Set (Cset cset) + + let rec handle_case_cset ign_case = function + | Cset s -> if ign_case then Cset.case_insens s else s + | Cast (Alternative l) -> + List.map ~f:(handle_case_cset ign_case) l |> Cset.union_all + | Complement l -> + List.map ~f:(handle_case_cset ign_case) l + |> Cset.union_all |> Cset.diff Cset.cany + | Difference (r, r') -> + Cset.inter + (handle_case_cset ign_case r) + (Cset.diff Cset.cany (handle_case_cset ign_case r')) + | Intersection l -> + List.map ~f:(handle_case_cset ign_case) l |> Cset.intersect_all + | Cast (No_case a) -> handle_case_cset true a + | Cast (Case a) -> handle_case_cset false a + + let rec handle_case ign_case : t -> (Cset.t, [ `Uncased ]) gen = + fun t -> + match t with + | Set s -> Set (handle_case_cset ign_case s) + | Sequence l -> Sequence (List.map ~f:(handle_case ign_case) l) + | Ast (Alternative l) -> + let l = List.map ~f:(handle_case ign_case) l in + Ast (Alternative l) + | Repeat (r, i, j) -> Repeat (handle_case ign_case r, i, j) + | ( Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | End_of_str | Last_end_of_line | Start | Stop ) as r -> + r + | Sem (k, r) -> Sem (k, handle_case ign_case r) + | Sem_greedy (k, r) -> Sem_greedy (k, handle_case ign_case r) + | Group (n, r) -> Group (n, handle_case ign_case r) + | No_group r -> No_group (handle_case ign_case r) + | Nest r -> Nest (handle_case ign_case r) + | Ast (Case r) -> handle_case false r + | Ast (No_case r) -> handle_case true r + | Pmark (i, r) -> Pmark (i, handle_case ign_case r) + + module Export = struct + type nonrec t = t + type nonrec letter = letter + + let pp = pp + let seq : t list -> t = function [ r ] -> r | l -> Sequence l + let epsilon : t = seq [] + let letter l = cset (Cset.csingle l) + (* let f = + Dense_map.make ~size:256 ~f:(fun i -> + cset + (Cset.csingle (Cset.CodePage.of_int i |> Cset.CodePage.to_letter))) + in + fun c -> f (Cset.CodePage.from_letter c |> 
Cset.CodePage.to_int) *) + + let any = cset Cset.cany + let char c = Cset.CodePage.of_char c + let int n = Cset.CodePage.to_letter @@ Cset.CodePage.of_int n + + (* let pp_str ppf l = Format.pp_print_iter + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + Stdlib.List.iter + (fun fmt letter -> Format.fprintf fmt "%a" Cset.Codec.pp letter) + ppf l *) + + let str s : t = + Cset.Codec.to_list s + |> fun l -> + List.map l ~f:letter + |> seq + + let as_set_elems elems = + match + List.map elems ~f:(function Set e -> e | _ -> raise_notrace Exit) + with + | exception Exit -> None + | e -> Some e + + let empty : t = Ast empty_alternative + + let alt (elems : t list) : t = + match elems with + | [] -> empty + | [ x ] -> x + | _ -> ( + match as_set_elems elems with + | None -> Ast (Alternative elems) + | Some elems -> Set (Cast (Alternative elems))) + + let repn r i j = + if i < 0 then invalid_arg "Re.repn"; + match (j, i) with + | Some j, _ when j < i -> invalid_arg "Re.repn" + | Some 0, 0 -> epsilon + | Some 1, 1 -> r + | _ -> Repeat (r, i, j) + + let rep r = repn r 0 None + let rep1 r = repn r 1 None + let opt r = repn r 0 (Some 1) + let bol = Beg_of_line + let eol = End_of_line + let bow = Beg_of_word + let eow = End_of_word + let word r = seq [ bow; r; eow ] + let not_boundary = Not_bound + let bos = Beg_of_str + let eos = End_of_str + let whole_string r = seq [ bos; r; eos ] + let leol = Last_end_of_line + let start = Start + let stop = Stop + + type 'b f = { f : 'a. 
'a -> ('a, 'b) ast } + + let make_set f t = + match t with Set x -> Set (Cast (f.f x)) | _ -> Ast (f.f t) + + let preserve_set f : t -> t = fun t -> match t with Set _ -> t | _ -> f t + let longest = preserve_set (fun t -> Sem (`Longest, t)) + let shortest = preserve_set (fun t -> Sem (`Shortest, t)) + let first = preserve_set (fun t -> Sem (`First, t)) + let greedy = preserve_set (fun t -> Sem_greedy (`Greedy, t)) + let non_greedy = preserve_set (fun t -> Sem_greedy (`Non_greedy, t)) + let group ?name r = Group (name, r) + let no_group = preserve_set (fun t -> No_group t) + let nest r = Nest r + let set str = cset (Cset.set str) + + let mark r = + let i = Pmark.gen () in + (i, Pmark (i, r)) + + (**** Character sets ****) + let as_set_or_error name elems = + match as_set_elems elems with None -> invalid_arg name | Some s -> s + + let inter elems = Set (Intersection (as_set_or_error "Re.inter" elems)) + let compl elems = Set (Complement (as_set_or_error "Re.compl" elems)) + + let diff r r' = + match (r, r') with + | Set r, Set r' -> Set (Difference (r, r')) + | _, _ -> invalid_arg "Re.diff" + + let case = + let f = { f = (fun r -> Case r) } in + fun t -> make_set f t + + let no_case = + let f = { f = (fun r -> No_case r) } in + fun t -> make_set f t + + let witness t = + let rec witness (t : no_case) = + match t with + | Set c -> + Cset.CodePage.to_letter (Cset.pick c) + |> Cset.Codec.to_bytes |> Bytes.unsafe_to_string + | Sequence xs -> String.concat "" (List.map ~f:witness xs) + | Ast (Alternative (x :: _)) -> witness x + | Ast (Alternative []) -> assert false + | Repeat (r, from, _to) -> + let w = witness r in + let b = Buffer.create (String.length w * from) in + for _i = 1 to from do + Buffer.add_string b w + done; + Buffer.contents b + | No_group r -> witness r + | Sem_greedy (_, r) | Sem (_, r) | Nest r | Pmark (_, r) | Group (_, r) + -> + witness r + | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound + | Beg_of_str | Last_end_of_line | Start 
| Stop | End_of_str -> + "" + in + witness (handle_case false t) + end + + (* open Export *) + (* [seq l] is the only element of [l] when [l] is a singleton, and + [Sequence l] otherwise. *) + let seq = function [ r ] -> r | l -> Sequence l + + (* [merge_sequences l] rewrites a list of alternatives: the branches of an + [Alternative] found in [l] are inlined into the list, and when a + [Sequence] and the next merged element are both [Sequence]s whose first + elements are equal (according to [equal Cset.equal]), that first element + is kept once, followed by an [Alternative] of the two remainders. + Everything else is kept in order. *) + let rec merge_sequences = function + | [] -> [] + | Ast (Alternative l') :: r -> merge_sequences (l' @ r) + | Sequence (x :: y) :: r -> ( + match merge_sequences r with + | Sequence (x' :: y') :: r' when equal Cset.equal x x' -> + Sequence [ x; Ast (Alternative [ seq y; seq y' ]) ] :: r' + | r' -> Sequence (x :: y) :: r') + | x :: r -> x :: merge_sequences r + + (*XXX Use a better algorithm allowing non-contiguous regions? *) + + (* [colorize color_map regexp] walks [regexp] and calls [Color_map.split] on + [color_map] with every [Set] it meets, with [Cset.nl] for [Beg_of_line] + and [End_of_line], and with [Cset.cword] for [Beg_of_word], [End_of_word] + and [Not_bound]. Returns [true] iff [regexp] contains + [Last_end_of_line]. *) + let colorize color_map (regexp : no_case) = + let lnl = ref false in + let rec colorize regexp = + match (regexp : no_case) with + | Set s -> Color_map.split color_map s + | Sequence l -> List.iter ~f:colorize l + | Ast (Alternative l) -> List.iter ~f:colorize l + | Repeat (r, _, _) -> colorize r + | Beg_of_line | End_of_line -> Color_map.split color_map Cset.nl + | Beg_of_word | End_of_word | Not_bound -> + Color_map.split color_map Cset.cword + | Beg_of_str | End_of_str | Start | Stop -> () + | Last_end_of_line -> lnl := true + | No_group r | Group (_, r) | Nest r | Pmark (_, r) -> colorize r + | Sem (_, r) | Sem_greedy (_, r) -> colorize r + in + colorize regexp; + !lnl + + (* [anchored t] tells whether [t] is anchored: [Beg_of_str] and [Start] are; + a [Sequence] is as soon as one of its elements is; an [Alternative] only + when all of its branches are; a [Repeat (r, i, _)] only when [i > 0] and + [r] is anchored; case, group, nest, mark and semantics wrappers defer to + their argument; every other node is not. *) + let rec anchored_ast : (t, _) ast -> bool = function + | Alternative als -> List.for_all ~f:anchored als + | No_case r | Case r -> anchored r + + and anchored : t -> bool = function + | Ast a -> anchored_ast a + | Sequence l -> List.exists ~f:anchored l + | Repeat (r, i, _) -> i > 0 && anchored r + | No_group r + | Sem (_, r) + | Sem_greedy (_, r) + | Group (_, r) + | Nest r + | Pmark (_, r) -> + anchored r + | Set _ | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound + | End_of_str | Last_end_of_line | Stop -> + false + | Beg_of_str | Start -> true + + (* [t_of_cset x] wraps the character set [x] as [Set x]. *) + let t_of_cset x = Set x +end diff --git a/lib/unicode/ast.mli b/lib/unicode/ast.mli new file mode 100644 index 00000000..9669ddd1 --- /dev/null +++ 
b/lib/unicode/ast.mli @@ -0,0 +1,115 @@ +type ('a, _) ast = private + | Alternative : 'a list -> ('a, [> `Uncased ]) ast + | No_case : 'a -> ('a, [> `Cased ]) ast + | Case : 'a -> ('a, [> `Cased ]) ast + +type ('a, 'case) gen = private + | Set of 'a + | Ast of (('a, 'case) gen, 'case) ast + | Sequence of ('a, 'case) gen list + | Repeat of ('a, 'case) gen * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Group of string option * ('a, 'case) gen + | No_group of ('a, 'case) gen + | Nest of ('a, 'case) gen + | Pmark of Pmark.t * ('a, 'case) gen + | Sem of Automata.Sem.t * ('a, 'case) gen + | Sem_greedy of Automata.Rep_kind.t * ('a, 'case) gen + +module type Export = sig + type nonrec letter + type nonrec t + + val empty : t + val epsilon : t + val str : string -> t + val char : char -> letter + val int : int -> letter + val no_case : t -> t + val case : t -> t + val diff : t -> t -> t + val compl : t list -> t + val repn : t -> int -> int option -> t + val inter : t list -> t + val letter : letter -> t + val any : t + val set : string -> t + val mark : t -> Pmark.t * t + val nest : t -> t + val no_group : t -> t + val whole_string : t -> t + val leol : t + val longest : t -> t + val greedy : t -> t + val non_greedy : t -> t + val stop : t + val not_boundary : t + val group : ?name:string -> t -> t + val word : t -> t + val first : t -> t + val bos : t + val bow : t + val eow : t + val eos : t + val bol : t + val start : t + val eol : t + val opt : t -> t + val rep : t -> t + val rep1 : t -> t + val alt : t list -> t + val shortest : t -> t + val seq : t list -> t + val pp : t Fmt.t + val witness : t -> string +end + +module type T = sig + type letter + type cset_t + type color_map_t + + type cset = + | Cset of cset_t + | Intersection of cset list + | Complement of cset list + | Difference of cset * cset + | Cast of (cset, [ `Cased | `Uncased ]) ast + 
+ type t = (cset, [ `Cased | `Uncased ]) gen + type no_case = (cset_t, [ `Uncased ]) gen + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t + + val merge_sequences : + (cset_t, [ `Uncased ]) gen list -> (cset_t, [ `Uncased ]) gen list + + val handle_case : bool -> t -> (cset_t, [ `Uncased ]) gen + val anchored : t -> bool + val colorize : color_map_t -> (cset_t, [ `Uncased ]) gen -> bool + val cset : cset_t -> t + val t_of_cset : cset -> t + + module Export : Export with type t = t and type letter = letter +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type letter = Cset.letter + and type cset_t = Cset.t + and type color_map_t = Color_map.t diff --git a/lib/unicode/automata.ml b/lib/unicode/automata.ml new file mode 100644 index 00000000..addafb33 --- /dev/null +++ b/lib/unicode/automata.ml @@ -0,0 +1,783 @@ +open Import + +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +let hash_combine h accu = (accu * 65599) + h + +module Ids : sig + module Id : sig + type t + + val equal : t -> t -> bool + val zero : t + val hash : t -> int + val pp : t Fmt.t + + module Hash_set : sig + type id := t + type t + + val create : unit -> t + val mem : t -> id -> bool + val add : t -> id -> unit + val clear : t -> unit + end + end + + type t + + val create : unit -> t + val next : t -> Id.t +end = struct + module Id = struct + type t = int + + module Hash_set = Hash_set + + let equal = Int.equal + let zero = 0 + let hash x = x + let pp = Fmt.int + end + + type t = int ref + + let create () = ref 0 + + let next t = + incr t; + !t +end + +module Id = Ids.Id + +module Sem = struct + type t = [ `Longest | `Shortest | `First ] + + let to_string = function + | `Shortest -> "short" + | `Longest -> "long" + | `First -> "first" + + let to_dyn t = Dyn.enum (to_string t) + let equal = Poly.equal + let pp ch k = Format.pp_print_string ch (to_string k) +end + +module Rep_kind = struct + type t = [ `Greedy | `Non_greedy ] + + let to_string = function `Greedy -> "Greedy" | `Non_greedy -> "Non_greedy" + let to_dyn t = Dyn.enum (to_string t) + let pp fmt t = Format.pp_print_string fmt (to_string t) +end + +module Mark : sig + type t = private int + + val compare : t -> t -> int + val equal : t -> t -> bool + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + val start : t + val prev : t -> t + val next : t -> t + val next2 : t -> t + val group_count : t -> int + val outside_range : t -> start_inclusive:t -> stop_inclusive:t -> bool +end = struct + type t = int + + let equal = Int.equal + let compare = Int.compare + let pp = Format.pp_print_int + let to_dyn = Dyn.int + let start = 0 + let prev x = pred x + let next x = succ x + let next2 x = x + 2 + let 
group_count x = x / 2 + + let outside_range t ~start_inclusive ~stop_inclusive = + t < start_inclusive || t > stop_inclusive +end + +module Idx : sig + type t = private int + + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + val to_int : t -> int + val unknown : t + val initial : t + val used : t -> bool + val make : int -> t + val equal : t -> t -> bool +end = struct + type t = int + + let to_dyn = Dyn.int + let to_int x = x + let pp = Format.pp_print_int + let used t = t >= 0 + let make x = x + let equal = Int.equal + let unknown = -1 + let initial = 0 +end + +module Marks = struct + type t = { marks : (Mark.t * Idx.t) list; pmarks : Pmark.Set.t } + + let to_dyn { marks; pmarks } : Dyn.t = + let open Dyn in + record + [ + ( "marks", + List.map marks ~f:(fun (m, idx) -> + pair (Mark.to_dyn m) (Idx.to_dyn idx)) + |> list ); + ("pmarks", Pmark.Set.to_list pmarks |> List.map ~f:Pmark.to_dyn |> list); + ] + + let equal { marks; pmarks } t = + List.equal + ~eq:(fun (x, y) (x', y') -> Mark.equal x x' && Idx.equal y y') + marks t.marks + && Pmark.Set.equal pmarks t.pmarks + + let empty = { marks = []; pmarks = Pmark.Set.empty } + + let hash_marks_offset = + let f acc ((a : Mark.t), (i : Idx.t)) = + hash_combine (a :> int) (hash_combine (i :> int) acc) + in + fun l init -> List.fold_left l ~init ~f + + (* [hash m accu] folds the marks and pmarks of [m] into the running hash + [accu]. + + The pmarks are hashed through their element list instead of through the + set value itself: [equal] compares them with [Pmark.Set.equal], whereas a + structural hash of the set would depend on the shape of its internal + tree, so two [equal] values could end up with different hashes. + NOTE(review): assumes [Pmark.Set.to_list] enumerates elements in the + set's own order, as [Set.S.elements] does - confirm. *) + let hash m accu = + hash_marks_offset m.marks + (hash_combine (Hashtbl.hash (Pmark.Set.to_list m.pmarks)) accu) + + let marks_set_idx = + let rec marks_set_idx idx marks = + match marks with + | [] -> [] + | (a, idx') :: rem -> + if Idx.equal idx' Idx.unknown then (a, idx) :: marks_set_idx idx rem + else marks + in + fun marks idx -> { marks with marks = marks_set_idx idx marks.marks } + + let filter t (b : Mark.t) (e : Mark.t) = + { + t with + marks = + List.filter t.marks ~f:(fun ((i : Mark.t), _) -> + Mark.outside_range i ~start_inclusive:b ~stop_inclusive:e); + } + + let set_mark t (i : Mark.t) = + { t with marks = (i, Idx.unknown) :: List.remove_assq i t.marks } + + let set_pmark t i = { t 
with pmarks = Pmark.Set.add i t.pmarks } + + let pp fmt { marks; pmarks } = + Format.pp_open_box fmt 1; + (match marks with + | [] -> () + | _ :: _ -> + Format.fprintf fmt "@[<2>marks@ %a@]" + (Format.pp_print_list (fun fmt (a, i) -> + Format.fprintf fmt "%a-%a" Mark.pp a Idx.pp i)) + marks); + (match Pmark.Set.to_list pmarks with + | [] -> () + | pmarks -> + Format.fprintf fmt "@[<2>pmarks %a@]" + (Format.pp_print_list Pmark.pp) + pmarks); + Format.pp_close_box fmt () +end + +module Status = struct + type t = Failed | Match of Mark_infos.t * Pmark.Set.t | Running +end + +module type T = sig + type cset + type cp + type category + type expr + + val is_eps : expr -> bool + val pp : expr Fmt.t + val cst : Ids.t -> cset -> expr + val empty : Ids.t -> expr + val alt : Ids.t -> expr list -> expr + val seq : Ids.t -> Sem.t -> expr -> expr -> expr + val eps : Ids.t -> expr + val rep : Ids.t -> Rep_kind.t -> Sem.t -> expr -> expr + val mark : Ids.t -> Mark.t -> expr + val pmark : Ids.t -> Pmark.t -> expr + val erase : Ids.t -> Mark.t -> Mark.t -> expr + val before : Ids.t -> category -> expr + val after : Ids.t -> category -> expr + val rename : Ids.t -> expr -> expr + + (****) + + (* States of the automata *) + + module State : sig + type t + + val pp : t Fmt.t + val dummy : t + val create : category -> expr -> t + val idx : t -> Idx.t + val status_no_mutex : t -> Status.t + val status : Mutex.t -> t -> Status.t + val to_dyn : t -> Dyn.t + + module Table : Hashtbl.S with type key = t + end + + (****) + + (* Computation of the states following a given state *) + + module Working_area : sig + type t + + val create : unit -> t + val index_count : t -> int + end + + val delta : Working_area.t -> category -> cp -> State.t -> State.t +end + +(* implementation *) +module Make (Cset : Cset.T) = struct + type cset = Cset.t + type cp = Cset.cp + + module Category = Category.Make (Cset) + + type category = Category.t + + module Expr = struct + type t = { id : Id.t; def : def } + + 
and def = + | Cst of Cset.t + | Alt of t list + | Seq of Sem.t * t * t + | Eps + | Rep of Rep_kind.t * Sem.t * t + | Mark of Mark.t + | Erase of Mark.t * Mark.t + | Before of Category.t + | After of Category.t + | Pmark of Pmark.t + + let wrap_sem sem sem' v = + let open Dyn in + let name = Sem.to_string sem' in + match sem with + | Some sem when Sem.equal sem sem' -> v + | None | Some _ -> ( + match v with List v -> variant name v | _ -> variant name [ v ]) + + let rec seq_as_list sem = function + | Eps -> [] + | Cst cs -> [ Cst cs ] + | Seq (sem', x, y) -> + if Sem.equal sem sem' then x.def :: seq_as_list sem y.def + else raise_notrace Not_found + | _ -> raise_notrace Not_found + + let seq_as_list sem t = + match seq_as_list sem t with exception Not_found -> None | s -> Some s + + let rec dyn_of_def sem = + let open Dyn in + function + | Cst cset -> Cset.to_dyn cset + | Alt alt -> variant "Alt" (List.map ~f:(to_dyn sem) alt) + | Seq (sem', x, y) -> + let to_dyn = to_dyn (Some sem') in + let x = + match seq_as_list sem' y.def with + | None -> variant "Seq" [ to_dyn x; to_dyn y ] + | Some y -> variant "Seq" (to_dyn x :: List.map y ~f:(dyn_of_def sem)) + in + wrap_sem sem sem' x + | Eps -> Enum "Eps" + | Rep (_, sem', t) -> + wrap_sem sem sem' (variant "Rep" [ to_dyn (Some sem') t ]) + | Mark m -> variant "Mark" [ Mark.to_dyn m ] + | Pmark m -> variant "Pmark" [ Pmark.to_dyn m ] + | Erase (x, y) -> variant "Erase" [ Mark.to_dyn x; Mark.to_dyn y ] + | Before c -> variant "Before" [ Category.to_dyn c ] + | After c -> variant "After" [ Category.to_dyn c ] + + and to_dyn sem { id = _; def } = dyn_of_def sem def + + let rec pp_with_sem sem ch e = + let open Fmt in + match e.def with + | Cst l -> sexp ch "cst" Cset.pp l + | Alt l -> sexp ch "alt" (list (pp_with_sem sem)) l + | Seq (k, e, e') -> + sexp ch "seq" + (triple Sem.pp (pp_with_sem sem) (pp_with_sem sem)) + (k, e, e') + | Eps -> str ch "eps" + | Rep (_rk, k, e) -> + sexp ch "rep" (pair Sem.pp (pp_with_sem (Some 
k))) (k, e) + | Mark i -> sexp ch "mark" Mark.pp i + | Pmark i -> sexp ch "pmark" Pmark.pp i + | Erase (b, e) -> sexp ch "erase" (pair Mark.pp Mark.pp) (b, e) + | Before c -> sexp ch "before" Category.pp c + | After c -> sexp ch "after" Category.pp c + + let pp = pp_with_sem None + let eps_expr = { id = Id.zero; def = Eps } + let mk ids def = { id = Ids.next ids; def } + let empty ids = mk ids (Alt []) + let cst ids s = if Cset.is_empty s then empty ids else mk ids (Cst s) + let eps ids = mk ids Eps + let rep ids kind sem x = mk ids (Rep (kind, sem, x)) + let mark ids m = mk ids (Mark m) + let pmark ids i = mk ids (Pmark i) + let erase ids m m' = mk ids (Erase (m, m')) + let before ids c = mk ids (Before c) + let after ids c = mk ids (After c) + let alt ids = function [] -> empty ids | [ c ] -> c | l -> mk ids (Alt l) + + let seq ids (kind : Sem.t) x y = + match (x.def, y.def) with + | Alt [], _ -> x + | _, Alt [] -> y + | Eps, _ -> y + | _, Eps when Sem.equal kind `First -> x + | _ -> mk ids (Seq (kind, x, y)) + + let is_eps expr = match expr.def with Eps -> true | _ -> false + + let rec rename ids x = + match x.def with + | Cst _ | Eps | Mark _ | Pmark _ | Erase _ | Before _ | After _ -> + mk ids x.def + | Alt l -> mk ids (Alt (List.map ~f:(rename ids) l)) + | Seq (k, y, z) -> mk ids (Seq (k, rename ids y, rename ids z)) + | Rep (g, k, y) -> mk ids (Rep (g, k, rename ids y)) + end + + type expr = Expr.t + + include Expr + + module Desc : sig + type t + + val pp : t Fmt.t + + module E : sig + type nonrec t = private + | TSeq of Sem.t * t * Expr.t + | TExp of Marks.t * Expr.t + | TMatch of Marks.t + end + + val to_dyn : t -> Dyn.t + val fold_right : t -> init:'acc -> f:(E.t -> 'acc -> 'acc) -> 'acc + val tseq : Sem.t -> t -> Expr.t -> t -> t + val initial : Expr.t -> t + val empty : t + val set_idx : Idx.t -> t -> t + val hash : t -> int -> int + val equal : t -> t -> bool + val status : t -> Status.t + val first_match : t -> Marks.t option + val remove_matches : t 
-> t + val split_at_match : t -> t * t + val add_match : t -> Marks.t -> t + val add_eps : t -> Marks.t -> t + val add_expr : t -> E.t -> t + val iter_marks : t -> f:(Marks.t -> unit) -> unit + val remove_duplicates : Id.Hash_set.t -> t -> Expr.t -> t + end = struct + module E = struct + type t = + | TSeq of Sem.t * t list * Expr.t + | TExp of Marks.t * Expr.t + | TMatch of Marks.t + + let rec equal_list l1 l2 = List.equal ~eq:equal l1 l2 + + and equal x y = + match (x, y) with + | TSeq (_, l1, e1), TSeq (_, l2, e2) -> + Id.equal e1.id e2.id && equal_list l1 l2 + | TExp (marks1, e1), TExp (marks2, e2) -> + Id.equal e1.id e2.id && Marks.equal marks1 marks2 + | TMatch marks1, TMatch marks2 -> Marks.equal marks1 marks2 + | _, _ -> false + + let rec hash (t : t) accu = + match t with + | TSeq (_, l, e) -> + hash_combine 0x172a1bce + (hash_combine (Id.hash e.id) (hash_list l accu)) + | TExp (marks, e) -> + hash_combine 0x2b4c0d77 + (hash_combine (Id.hash e.id) (Marks.hash marks accu)) + | TMatch marks -> hash_combine 0x1c205ad5 (Marks.hash marks accu) + + and hash_list = + let f acc x = hash x acc in + fun l init -> List.fold_left l ~init ~f + end + + type t = E.t list + + let rec to_dyn sem t = Dyn.list (List.map ~f:(dyn_of_e sem) t) + + and dyn_of_e sem = + let open Dyn in + function + | E.TSeq (sem', x, y) -> + wrap_sem sem sem' + (variant "TSeq" [ to_dyn (Some sem') x; Expr.to_dyn (Some sem') y ]) + | TExp (marks, e) -> + let e = + let base = [ Expr.to_dyn sem e ] in + if Marks.(equal empty marks) then base else Marks.to_dyn marks :: base + in + variant "TExp" e + | TMatch m -> variant "TMarks" [ Marks.to_dyn m ] + + let to_dyn = to_dyn None + + open E + + let equal = E.equal_list + let hash = E.hash_list + + let tseq' kind x y = + match x with + | [] -> [] + | [ TExp (marks, { def = Eps; _ }) ] -> [ TExp (marks, y) ] + | _ -> [ TSeq (kind, x, y) ] + + let tseq kind x y rem = tseq' kind x y @ rem + + let rec fold_right t ~init ~f = + match t with [] -> init | x :: 
xs -> f x (fold_right xs ~init ~f) + + let rec iter_marks t ~f = + List.iter t ~f:(fun (e : E.t) -> + match e with + | TSeq (_, l, _) -> iter_marks l ~f + | TExp (marks, _) | TMatch marks -> f marks) + + let rec print_state_rec ch e (y : Expr.t) = + match e with + | TMatch marks -> Format.fprintf ch "@[<2>(TMatch@ %a)@]" Marks.pp marks + | TSeq (sem, l', x) -> + Format.fprintf ch "@[<2>(TSeq@ %a@ " Sem.pp sem; + print_state_lst ch l' x; + Format.fprintf ch "@ %a)@]" Expr.pp x + | TExp (marks, { def = Eps; _ }) -> + Format.fprintf ch "@[<2>(TExp@ %a@ (%a)@ (eps))@]" Id.pp y.id Marks.pp + marks + | TExp (marks, x) -> + Format.fprintf ch "@[<2>(TExp@ %a@ (%a)@ %a)@]" Id.pp x.id Marks.pp + marks Expr.pp x + + and print_state_lst ch l y = + match l with + | [] -> Format.fprintf ch "()" + | e :: rem -> + print_state_rec ch e y; + List.iter rem ~f:(fun e -> + Format.fprintf ch "@ | "; + print_state_rec ch e y) + + let pp ch t = print_state_lst ch [ t ] { id = Id.zero; def = Eps } + + let rec first_match = function + | [] -> None + | TMatch marks :: _ -> Some marks + | _ :: r -> first_match r + + let remove_matches = + List.filter ~f:(function TMatch _ -> false | _ -> true) + + let split_at_match = + let rec split_at_match_rec l = function + | [] -> assert false + | TMatch _ :: r -> (List.rev l, remove_matches r) + | x :: r -> split_at_match_rec (x :: l) r + in + fun l -> split_at_match_rec [] l + + let status : _ -> Status.t = function + | [] -> Failed + | TMatch m :: _ -> + Match (Mark_infos.make (m.marks :> (int * int) list), m.pmarks) + | _ -> Running + + let set_idx = + let rec f idx = function + | TMatch marks -> TMatch (Marks.marks_set_idx marks idx) + | TSeq (kind, l, x) -> TSeq (kind, set_idx idx l, x) + | TExp (marks, x) -> TExp (Marks.marks_set_idx marks idx, x) + and set_idx idx xs = List.map xs ~f:(f idx) in + set_idx + + let[@ocaml.warning "-32"] pp fmt t = + Format.fprintf fmt "[%a]" + (Format.pp_print_list ~pp_sep:(Fmt.lit "; ") pp) + t + + let empty = [] + 
let initial expr = [ TExp (Marks.empty, expr) ] + (* [add_match], [add_eps] and [add_expr] each push one element at the front of + [t]: a [TMatch marks], a [TExp] pairing [marks] with [eps_expr], and + [expr] itself, respectively. *) + let add_match t marks = TMatch marks :: t + let add_eps t marks = TExp (marks, eps_expr) :: t + let add_expr t expr = expr :: t + + (* [remove_duplicates seen l y] clears [seen], then rebuilds [l] keeping, for + each expression id, only the first [TExp] carrying it; [seen] records the + ids met so far. A [TExp] whose expression is [Eps] is keyed on [y.id] + instead of its own id. The nested list of a [TSeq (_, l', x)] is processed + with [x] in the role of [y], and the result is rebuilt with [tseq]. In + each list, everything after the first [TMatch] is dropped. *) + let remove_duplicates = + let rec loop seen l y = + match l with + | [] -> [] + | (TMatch _ as x) :: _ -> + (* Truncate after first match *) + [ x ] + | TSeq (kind, l, x) :: r -> + let l = loop seen l x in + let r = loop seen r y in + tseq kind l x r + | (TExp (_marks, { def = Eps; _ }) as e) :: r -> + if Id.Hash_set.mem seen y.id then loop seen r y + else ( + Id.Hash_set.add seen y.id; + e :: loop seen r y) + | (TExp (_marks, x) as e) :: r -> + if Id.Hash_set.mem seen x.id then loop seen r y + else ( + Id.Hash_set.add seen x.id; + e :: loop seen r y) + in + fun seen l y -> + Id.Hash_set.clear seen; + loop seen l y + end + + module E = Desc.E + + module State = struct + type t = { + idx : Idx.t; + category : Category.t; + desc : Desc.t; + mutable status : Status.t option; + hash : int; + } + (* Thread-safety: We use double-checked locking to access field + [status] in function [status] below. 
*) + + let pp fmt t = Desc.pp fmt t.desc + let[@inline] idx t = t.idx + let to_dyn t = Desc.to_dyn t.desc + + let dummy = + { + idx = Idx.unknown; + category = Category.dummy; + desc = Desc.empty; + status = None; + hash = -1; + } + + let hash idx cat desc = + Desc.hash desc (hash_combine idx (hash_combine (Category.to_int cat) 0)) + land 0x3FFFFFFF + + let mk idx cat desc = + { + idx; + category = cat; + desc; + status = None; + hash = hash (idx :> int) cat desc; + } + + let create cat e = mk Idx.initial cat (Desc.initial e) + + let equal { idx; category; desc; status = _; hash } t = + Int.equal hash t.hash && Idx.equal idx t.idx + && Category.equal category t.category + && Desc.equal desc t.desc + + (* To be called when the mutex has already been acquired *) + let status_no_mutex s = + match s.status with + | Some s -> s + | None -> + let st = Desc.status s.desc in + s.status <- Some st; + st + + (* [status m s] returns the status cached in [s] when there is one; + otherwise it locks [m], lets [status_no_mutex] compute and cache the + status, and unlocks [m]. + + [m] is unlocked on the exceptional path too, so that a raise while the + status is being computed cannot leave the mutex held; the exception is + then re-raised to the caller. *) + let status m s = + match s.status with + | Some s -> s + | None -> ( + Mutex.lock m; + match status_no_mutex s with + | st -> + Mutex.unlock m; + st + | exception e -> + Mutex.unlock m; + raise e) + + module Table = Hashtbl.Make (struct + type nonrec t = t + + let equal = equal + let hash t = t.hash + end) + end + + (**** Find a free index ****) + + module Working_area = struct + type t = { + mutable ids : Bit_vector.t; + seen : Id.Hash_set.t; + index_count : int Atomic.t; + } + + let create () = + { + ids = Bit_vector.create_zero 1; + seen = Id.Hash_set.create (); + index_count = Atomic.make 0; + } + + let index_count w = Atomic.get w.index_count + + let mark_used_indices tbl = + Desc.iter_marks ~f:(fun marks -> + List.iter marks.marks ~f:(fun (_, i) -> + if Idx.used i then Bit_vector.set tbl (i :> int) true)) + + let rec find_free tbl idx len = + if idx = len || not (Bit_vector.get tbl idx) then idx + else find_free tbl (idx + 1) len + + let free_index t l = + Bit_vector.reset_zero t.ids; + mark_used_indices t.ids l; + let len = Bit_vector.length t.ids in + let idx = find_free t.ids 0 len in + if idx = len then ( + t.ids <- 
Bit_vector.create_zero (2 * len); + (* This function is only called when the mutex is locked. So we + are sure that this is always coherent with the length of + [t.ids]. *) + Atomic.set t.index_count (2 * len)); + Idx.make idx + end + + (**** Computation of the next state ****) + + type ctx = { cp : Cset.cp; prev_cat : Category.t; next_cat : Category.t } + + let rec delta_expr ({ cp; _ } as ctx) marks (x : Expr.t) rem = + (*Format.eprintf "%d@." x.id;*) + match x.def with + | Cst s -> if Cset.mem cp s then Desc.add_eps rem marks else rem + | Alt l -> delta_alt ctx marks l rem + | Seq (kind, y, z) -> + let y = delta_expr ctx marks y Desc.empty in + delta_seq ctx kind y z rem + | Rep (rep_kind, kind, y) -> delta_rep ctx marks x rep_kind kind y rem + | Eps -> Desc.add_match rem marks + | Mark i -> Desc.add_match rem (Marks.set_mark marks i) + | Pmark i -> Desc.add_match rem (Marks.set_pmark marks i) + | Erase (b, e) -> Desc.add_match rem (Marks.filter marks b e) + | Before cat -> + if Category.intersect ctx.next_cat cat then Desc.add_match rem marks + else rem + | After cat -> + if Category.intersect ctx.prev_cat cat then Desc.add_match rem marks + else rem + + and delta_rep ctx marks x rep_kind kind y rem = + let y, marks' = + let y = delta_expr ctx marks y Desc.empty in + match Desc.first_match y with + | None -> (y, marks) + | Some marks -> (Desc.remove_matches y, marks) + in + match rep_kind with + | `Greedy -> Desc.tseq kind y x (Desc.add_match rem marks') + | `Non_greedy -> Desc.add_match (Desc.tseq kind y x rem) marks + + and delta_alt ctx marks l rem = + List.fold_right l ~init:rem ~f:(delta_expr ctx marks) + + and delta_seq ctx (kind : Sem.t) y z rem = + match Desc.first_match y with + | None -> Desc.tseq kind y z rem + | Some marks -> ( + match kind with + | `Longest -> + Desc.tseq kind (Desc.remove_matches y) z (delta_expr ctx marks z rem) + | `Shortest -> + delta_expr ctx marks z (Desc.tseq kind (Desc.remove_matches y) z rem) + | `First -> + let y, y' = 
Desc.split_at_match y in + Desc.tseq kind y z (delta_expr ctx marks z (Desc.tseq kind y' z rem))) + + let rec delta_e ctx marks (x : E.t) rem = + match x with + | TSeq (kind, y, z) -> + let y = delta_desc ctx marks y Desc.empty in + delta_seq ctx kind y z rem + | TExp (marks, e) -> delta_expr ctx marks e rem + | TMatch _ -> Desc.add_expr rem x + + and delta_desc ctx marks (l : Desc.t) rem = + Desc.fold_right l ~init:rem ~f:(fun y acc -> delta_e ctx marks y acc) + + let delta (tbl_ref : Working_area.t) next_cat letter (st : State.t) = + let expr = + let prev_cat = st.category in + let ctx = { cp = letter; next_cat; prev_cat } in + Desc.remove_duplicates tbl_ref.seen + (delta_desc ctx Marks.empty st.desc Desc.empty) + Expr.eps_expr + in + let idx = Working_area.free_index tbl_ref expr in + let expr = Desc.set_idx idx expr in + State.mk idx next_cat expr +end diff --git a/lib/unicode/automata.mli b/lib/unicode/automata.mli new file mode 100644 index 00000000..ce19d289 --- /dev/null +++ b/lib/unicode/automata.mli @@ -0,0 +1,123 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* Regular expressions *) + +module Ids : sig + type t + + val create : unit -> t +end + +module Sem : sig + type t = [ `Longest | `Shortest | `First ] + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t +end + +module Rep_kind : sig + type t = [ `Greedy | `Non_greedy ] + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t +end + +module Mark : sig + type t = private int + + val compare : t -> t -> int + val equal : t -> t -> bool + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + val start : t + val prev : t -> t + val next : t -> t + val next2 : t -> t + val group_count : t -> int + val outside_range : t -> start_inclusive:t -> stop_inclusive:t -> bool +end + +module Idx : sig + type t + + val to_int : t -> int +end + +module Status : sig + type t = Failed | Match of Mark_infos.t * Pmark.Set.t | Running +end + +module type T = sig + type cset + type cp + type category + type expr + + val is_eps : expr -> bool + val pp : expr Fmt.t + val cst : Ids.t -> cset -> expr + val empty : Ids.t -> expr + val alt : Ids.t -> expr list -> expr + val seq : Ids.t -> Sem.t -> expr -> expr -> expr + val eps : Ids.t -> expr + val rep : Ids.t -> Rep_kind.t -> Sem.t -> expr -> expr + val mark : Ids.t -> Mark.t -> expr + val pmark : Ids.t -> Pmark.t -> expr + val erase : Ids.t -> Mark.t -> Mark.t -> expr + val before : Ids.t -> category -> expr + val after : Ids.t -> category -> expr + val rename : Ids.t -> expr -> expr + + (****) + + (* States of the automata *) + + module State : sig + type t + + val pp : t Fmt.t + val dummy : t + val create : category -> expr -> t + val idx : t -> Idx.t + val status_no_mutex : t -> Status.t + val status : Mutex.t -> t -> Status.t + val to_dyn : t -> Dyn.t + + module Table : Hashtbl.S with type key = t + end + + (****) + + (* Computation of the 
states following a given state *) + + module Working_area : sig + type t + + val create : unit -> t + val index_count : t -> int + end + + val delta : Working_area.t -> category -> cp -> State.t -> State.t +end + +module Make (Cset : Cset.T) : T with type cset = Cset.t and type cp = Cset.cp and type category = Category.Make(Cset).t diff --git a/lib/unicode/bit_vector.ml b/lib/unicode/bit_vector.ml new file mode 100644 index 00000000..06f69a8c --- /dev/null +++ b/lib/unicode/bit_vector.ml @@ -0,0 +1,48 @@ +type t = + { len : int + ; bits : Bytes.t + } + +let byte s i = Char.code (Bytes.unsafe_get s i) +let set_byte s i x = Bytes.unsafe_set s i (Char.chr x) +let length t = t.len + +let unsafe_set v n b = + let i = n lsr 3 in + let c = byte v.bits i in + let mask = 1 lsl (n land 7) in + set_byte v.bits i (if b then c lor mask else c land lnot mask) +;; + +let set v n b = + if n < 0 || n >= v.len then invalid_arg "Bit_vector.set"; + unsafe_set v n b +;; + +let unsafe_get v n = + let i = n lsr 3 in + byte v.bits i land (1 lsl (n land 7)) > 0 +;; + +let get v n = + if n < 0 || n >= v.len then invalid_arg "Bit_vector.get"; + unsafe_get v n +;; + +let reset_zero t = Bytes.fill t.bits 0 (Bytes.length t.bits) '\000' + +let create_zero len = + let bits = + let r = len land 7 in + let q = len lsr 3 in + let len = if r = 0 then q else q + 1 in + Bytes.make len '\000' + in + { len; bits } +;; + +let pp fmt { len; bits } = + let len fmt () = Fmt.sexp fmt "len" Fmt.int len in + let bits fmt () = Fmt.sexp fmt "bits" Fmt.bytes bits in + Format.fprintf fmt "%a@.%a@." 
len () bits () +;; diff --git a/lib/unicode/bit_vector.mli b/lib/unicode/bit_vector.mli new file mode 100644 index 00000000..8f07063f --- /dev/null +++ b/lib/unicode/bit_vector.mli @@ -0,0 +1,8 @@ +(** Fixed-length vectors of bits. *) + +type t + +(** [length t] is the number of bits in [t]. *) +val length : t -> int + +(** [set t n b] sets bit [n] of [t] to [b]. + @raise Invalid_argument if [n < 0] or [n >= length t]. *) +val set : t -> int -> bool -> unit + +(** [create_zero len] is a vector of [len] bits, all cleared. *) +val create_zero : int -> t + +(** [get t n] is the value of bit [n] of [t]. + @raise Invalid_argument if [n < 0] or [n >= length t]. *) +val get : t -> int -> bool + +(** [reset_zero t] clears every bit of [t]. *) +val reset_zero : t -> unit +val pp : t Fmt.t diff --git a/lib/unicode/category.ml b/lib/unicode/category.ml new file mode 100644 index 00000000..0e63091f --- /dev/null +++ b/lib/unicode/category.ml @@ -0,0 +1,62 @@ +module type T = sig + type letter + type t [@@immediate] + + val ( ++ ) : t -> t -> t + val dummy : t + val inexistant : t + val letter : t + val not_letter : t + val newline : t + val lastnewline : t + val search_boundary : t + val to_int : t -> int + val equal : t -> t -> bool + val compare : t -> t -> int + val intersect : t -> t -> bool + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + val from_letter : letter -> t +end + +module Make (Cset : Cset.T) : T with type letter = Cset.letter = struct + type letter = Cset.letter + type t = int + + let equal (x : int) (y : int) = x = y + let compare (x : int) (y : int) = compare x y + let to_int x = x + let pp = Format.pp_print_int + let intersect x y = x land y <> 0 + let ( ++ ) x y = x lor y + let dummy = -1 + let inexistant = 1 + let letter = 2 + let not_letter = 4 + let newline = 8 + let lastnewline = 16 + let search_boundary = 32 + let to_dyn = Dyn.int + + let from_letter = + fun l -> + let c = Cset.CodePage.from_letter l in + match Cset.mem c Cset.cword with + | true -> letter + | _ -> ( + match Cset.mem c Cset.nl with + | true -> not_letter ++ newline + | _ -> not_letter) + + (* function + (* Should match [cword] definition *) + | 'a' .. 'z' + | 'A' .. 'Z' + | '0' .. '9' + | '_' | '\170' | '\181' | '\186' + | '\192' .. '\214' + | '\216' .. '\246' + | '\248' .. 
'\255' -> letter + | '\n' -> not_letter ++ newline + | _ -> not_letter *) +end diff --git a/lib/unicode/category.mli b/lib/unicode/category.mli new file mode 100644 index 00000000..18035e05 --- /dev/null +++ b/lib/unicode/category.mli @@ -0,0 +1,27 @@ +(** Categories represent the various kinds of characters that can be tested by + look-ahead and look-behind operations. + + This is more restricted than Cset, but faster. *) + +module type T = sig + type letter + type t [@@immediate] + + (** [a ++ b] is the union of the categories [a] and [b]. *) + val ( ++ ) : t -> t -> t + val dummy : t + val inexistant : t + val letter : t + val not_letter : t + val newline : t + val lastnewline : t + val search_boundary : t + val to_int : t -> int + val equal : t -> t -> bool + val compare : t -> t -> int + + (** [intersect a b] is [true] when [a] and [b] have a category in common. *) + val intersect : t -> t -> bool + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + + (** [from_letter l] is the category of [l]. In {!Make} this is [letter] when + [l] belongs to [Cset.cword], [not_letter ++ newline] when it belongs to + [Cset.nl], and [not_letter] otherwise. *) + val from_letter : letter -> t +end + +module Make (Cset : Cset.T) : T with type letter = Cset.letter diff --git a/lib/unicode/color_map.ml b/lib/unicode/color_map.ml new file mode 100644 index 00000000..625a3ff9 --- /dev/null +++ b/lib/unicode/color_map.ml @@ -0,0 +1,259 @@ +(* In reality, this can really be represented as a bool array. + + The representation is best thought of as an array of the letters present in the regular expression + (which is a Cset.t): + + (a, 0), (b, 1), (c, 0), (d, 0), ... + + characters belonging to the same color are represented by sequences of + characters with the flag set to 0. 
+*) + +module type T = sig + type cp + type letter + type cset_t + type color = int + type t + + module Repr : sig + type t + + val repr : t -> color -> letter + val length : t -> int + val pp : Format.formatter -> t -> unit + end + + module Table : sig + type t + + val get : t -> letter -> cp + val get_letter : t -> cp -> letter + val translate_colors : t -> cset_t -> cset_t + val pp : Format.formatter -> t -> unit + end + + val make : unit -> t + val flatten : t -> Table.t * Repr.t + val split : t -> cset_t -> unit + val pp : Format.formatter -> t -> unit +end + +module Make (Cset : Cset.T) = struct + type letter = Cset.letter + type cp = Cset.cp + type cset_t = Cset.t + type color = int + type t = int array ref + + let make () : t = ref [||] + let mem cp t = Array.mem cp !t + + (* t shall be ordered *) + let add cp (t : t) = + if mem cp t then () + else + let len = Array.length !t in + let r = Array.make (len + 1) (-1) in + let rec iter i max = + if i >= max then ( + Array.blit !t 0 r 0 i; + r.(i) <- cp; + t := r) + else if Int.compare cp !t.(i) < 0 then ( + Array.blit !t 0 r 0 i; + r.(i) <- cp; + Array.blit !t i r (i + 1) (len - i); + t := r) + else iter (succ i) max + in + iter 0 len + + let pp ppf t = + Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + (fun fmt cp -> Format.fprintf fmt "%d" cp) + ppf (Array.to_list !t) + + let binary_search comp i v = + let rec loop start finish = + if finish < start || start > finish then None + else + let m = start + ((finish - start) / 2) in + match comp i (fst v.(m)) with + | 0 -> Some m + | 1 -> loop (m + 1) finish + | _ -> loop start (m - 1) + in + loop 0 (Array.length v - 1) + + let cmp_interval i (min, max) = + if Int.compare i min >= 0 && Int.compare i max <= 0 then 0 + else if Int.compare i min <= 0 then -1 + else 1 + + module Repr = struct + type t = int array + + let pp ppf (t : t) = + Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + (fun fmt x -> Format.fprintf fmt 
"%d" x) + ppf (Array.to_list t) + + let repr (t : t) (color : color) = + if Array.length t = 0 then Cset.CodePage.(of_int 0 |> to_letter) + else Cset.CodePage.(t.(color) |> of_int |> to_letter) + + let length = Array.length + end + + module Table = struct + type t = ((int * int) * color) array + + let print_one ppf ((c1, c2), color) = + if Int.equal c1 c2 then Format.fprintf ppf "%d (%d)" c1 color + else Format.fprintf ppf "%d-%d (%d)" c1 c2 color + + let pp ppf (t : t) = + Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + print_one ppf (Array.to_list t) + + (* get the color value as a letter from a cp *) + let get_letter (t : t) c = + match binary_search cmp_interval (Cset.CodePage.to_int c) t with + | None -> Cset.CodePage.(of_int 0 |> to_letter) + | Some x -> Cset.CodePage.(snd t.(x) |> of_int |> to_letter) + + (* get the color value as a cp from a letter *) + let get (t : t) letter = + match + binary_search cmp_interval + Cset.CodePage.(from_letter letter |> to_int) + t + with + | None -> Cset.CodePage.of_int 0 + | Some x -> Cset.CodePage.(snd t.(x) |> of_int) + + let translate_colors (cm : t) cset = + Cset.fold_right cset ~init:Cset.empty ~f:(fun i j l -> + let start = get_letter cm i in + let stop = get_letter cm j in + Cset.union (Cset.cseq start stop) l) + end + + let len_cany = Cset.fold_left ~f:(fun i _ _ -> succ i) ~init:0 Cset.cany + + let flatten : t -> Table.t * Repr.t = + fun cm -> + if Array.length !cm = 0 then + ( Cset.fold_left Cset.cany ~init:[||] ~f:(fun acc i j -> + Array.append acc + [| ((Cset.CodePage.to_int i, Cset.CodePage.to_int j), 0) |]), + [| Cset.CodePage.(max_t |> to_int) |] ) + else + let dim = len_cany + Array.length !cm in + let repr = Array.make dim 0 in + let table = Array.make dim ((0, 0), 0) in + table.(0) <- ((0, max 0 @@ pred !cm.(0)), 0); + repr.(0) <- max 0 @@ pred !cm.(0); + let len = Array.length !cm in + let rec iter (a : int array) (color : color) i max = + if i >= max then + let cp = pred 
a.(pred i) in + let idx = ref i in + let max_cp = Cset.CodePage.(to_int max_t) in + let repr_len = + if Int.compare max_cp cp > 0 then ( + let cset = + Cset.diff Cset.cany + (Cset.cseq + Cset.CodePage.(0 |> of_int |> to_letter) + Cset.CodePage.(cp |> of_int |> to_letter)) + in + Cset.iter cset ~f:(fun cp1 cp2 -> + table.(!idx) <- (Cset.CodePage.(to_int cp1, to_int cp2), color); + repr.(i) <- Cset.CodePage.(to_int cp2); + incr idx); + i + 1) + else i + in + (Array.sub table 0 !idx, Array.sub repr 0 repr_len) + else ( + table.(i) <- ((a.(pred i), pred a.(i)), color); + repr.(i) <- pred a.(i); + iter a (succ color) (succ i) max) + in + iter !cm 1 1 len + + (* mark all the endpoints of the intervals of the alphabet. *) + let split : t -> Cset.t -> unit = + fun t cset -> + Cset.iter cset ~f:(fun i j -> + if not @@ Int.equal (Cset.CodePage.to_int i) 0 then + add (Cset.CodePage.to_int i) t; + add (Cset.CodePage.to_int j |> succ) t) +end + +module Utf8 = Make (Cset.Utf8) +module Utf16be = Make (Cset.Utf16be) +module Utf16le = Make (Cset.Utf16le) + +module Latin1 = struct + type cp = Cset.Latin1.cp + type color = int + type letter = Cset.Latin1.letter + type cset_t = Cset.Latin1.t + type t = Bytes.t + + module Repr = struct + type t = string + + let repr t color = + t.[color] + + let length = String.length + let pp ppf t = Format.fprintf ppf "%s" t + end + + module Table = struct + type t = string + + let get_letter t c = t.[Cset.Latin1.CodePage.to_int c] + + let get t c = + Cset.Latin1.CodePage.from_letter (String.unsafe_get t (Char.code c)) + + let translate_colors (cm : t) (cset : Cset.Latin1.t) = + Cset.Latin1.fold_right cset ~init:Cset.Latin1.empty ~f:(fun i j l -> + let start = get_letter cm i in + let stop = get_letter cm j in + Cset.Latin1.union (Cset.Latin1.cseq start stop) l) + + let pp ppf t = Format.fprintf ppf "%s" t + end + + let make () = Bytes.make 257 '\000' + let pp ppf t = Format.fprintf ppf "%s" (Bytes.unsafe_to_string t) + + let flatten cm = + + let c = 
Bytes.create 256 in + let color_repr = Bytes.create 256 in + let v = ref 0 in + Bytes.set c 0 '\000'; + Bytes.set color_repr 0 '\000'; + for i = 1 to 255 do + if Bytes.get cm i <> '\000' then incr v; + Bytes.set c i (Char.chr !v); + Bytes.set color_repr !v (Char.chr i) + done; + (Bytes.unsafe_to_string c, Bytes.sub_string color_repr 0 (!v + 1)) + + (* mark all the endpoints of the intervals of the char set with the 1 byte *) + let split t set = + Cset.Latin1.iter set ~f:(fun i j -> + Bytes.set t (Cset.Latin1.CodePage.to_int i) '\001'; + Bytes.set t (Cset.Latin1.CodePage.to_int j + 1) '\001') +end diff --git a/lib/unicode/color_map.mli b/lib/unicode/color_map.mli new file mode 100644 index 00000000..f65dc5fb --- /dev/null +++ b/lib/unicode/color_map.mli @@ -0,0 +1,67 @@ +(* Color maps exist to provide an optimization for the regex engine. The fact + that some characters are entirely equivalent for some regexes means that we + can use them interchangeably. + + A color map assigns a color to every character in our character set. Any two + characters with the same color will be treated equivalently by the automaton.
+*) + +module type T = sig + type cp + type letter + type cset_t + type color = int + type t + + module Repr : sig + type t + + val repr : t -> color -> letter + val length : t -> int + val pp : Format.formatter -> t -> unit + end + + module Table : sig + type t + + val get : t -> letter -> cp + val get_letter : t -> cp -> letter + val translate_colors : t -> cset_t -> cset_t + val pp : Format.formatter -> t -> unit + end + + val make : unit -> t + val flatten : t -> Table.t * Repr.t + val split : t -> cset_t -> unit + val pp : Format.formatter -> t -> unit +end + +module Make (Cset : Cset.T) : + T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t + +module Utf8 : + T + with type cp = Cset.Utf8.cp + and type letter = Cset.Utf8.letter + and type cset_t = Cset.Utf8.t + +module Utf16be : + T + with type cp = Cset.Utf16be.cp + and type letter = Cset.Utf16be.letter + and type cset_t = Cset.Utf16be.t + +module Utf16le : + T + with type cp = Cset.Utf16le.cp + and type letter = Cset.Utf16le.letter + and type cset_t = Cset.Utf16le.t + +module Latin1 : + T + with type cp = Cset.Latin1.cp + and type letter = Cset.Latin1.letter + and type cset_t = Cset.Latin1.t diff --git a/lib/unicode/compile.ml b/lib/unicode/compile.ml new file mode 100644 index 00000000..04191aee --- /dev/null +++ b/lib/unicode/compile.ml @@ -0,0 +1,945 @@ +open Import + +module type T = sig + type re + type ast + + module Stream : sig + type t + type 'a feed = Ok of 'a | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + val no_match_starts_before : t -> int 
+ end + end + + type match_info = + | Match of Group.t + | Failed + | Running of { no_match_starts_before : int } + + val match_str_no_bounds : + groups:bool -> + partial:bool -> + re -> + string -> + pos:int -> + len:int -> + match_info + + val match_str : + groups:bool -> + partial:bool -> + re -> + string -> + pos:int -> + len:int -> + match_info + + val match_str_p : re -> string -> pos:int -> len:int -> bool + val compile : ast -> re + val group_count : re -> int + val group_names : re -> (string * int) list + val pp_re : re Fmt.t +end + +let rec iter n f v = if Int.equal n 0 then v else iter (n - 1) f (f v) + +module Idx : sig + type t [@@immediate] + + val unknown : t + val make_break : Automata.Idx.t -> t + val of_idx : Automata.Idx.t -> t + val is_idx : t -> bool + val is_break : t -> bool + val is_unknown : t -> bool + val idx : t -> int + val break_idx : t -> int +end = struct + type t = int + + let unknown = -2 + let break = -3 + let of_idx (x : Automata.Idx.t) = Automata.Idx.to_int x [@@inline always] + let is_idx t = t >= 0 [@@inline always] + let is_break x = x <= break [@@inline always] + let is_unknown x = x = unknown [@@inline always] + let idx t = t [@@inline always] + + let make_break (idx : Automata.Idx.t) = -5 - Automata.Idx.to_int idx + [@@inline always] + + let break_idx t = (t + 5) * -1 [@@inline always] +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Ast = struct + include Ast + include Ast.Make (Cset) (Color_map) + end + + module Automata = struct + include Automata + include Automata.Make (Cset) + end + + module Category = struct + include Category + include Category.Make (Cset) + end + + type ast = Ast.t + + type state_info = { + idx : Idx.t; + (* Index of the current position in the position table. 
+ Not yet computed transitions point to a dummy state where + [idx] is set to [unknown]; + [idx] is set to [break] for states that either always + succeed or always fail. *) + mutable final : (Category.t * (Automata.Idx.t * Automata.Status.t)) list; + (* Mapping from the category of the next character to + - the index where the next position should be saved + - possibly, the list of marks (and the corresponding indices) + corresponding to the best match *) + desc : Automata.State.t; (* Description of this state of the automaton *) + } + + (* Thread-safety: we use double-checked locking to access field [final]. *) + + (* A state [t] is a pair composed of some information about the + state [state_info] and a transition table [t array], indexed by + color. For performance reasons, to avoid an indirection, we manually + unbox the transition table: we allocate a single array, with the + state information at index 0, followed by the transitions. *) + module State : sig + type t + + val make : ncol:int -> state_info -> t + val make_break : state_info -> t + val get_info : t -> state_info + val follow_transition : t -> color:Cset.cp -> t + val set_transition : t -> color:Cset.cp -> t -> unit + val is_unknown_transition : t -> color:Cset.cp -> bool + end = struct + type t = Table of t array [@@unboxed] + + (* Thread-safety: + We store the state information at index 0. For other elements + of the transition table, which are lazily computed, we use + double-checked locking.
*) + + let get_info (Table st) : state_info = Obj.magic (Array.unsafe_get st 0) + [@@inline always] + + let set_info (Table st) (info : state_info) = st.(0) <- Obj.magic info + + let follow_transition (Table st) ~color = + Array.unsafe_get st (1 + Cset.CodePage.to_int color) + [@@inline always] + + let set_transition (Table st) ~color st' = + st.(1 + Cset.CodePage.to_int color) <- st' + + let is_unknown_transition st ~color = + let st' = follow_transition st ~color in + let info = get_info st' in + Idx.is_unknown info.idx + + let dummy (info : state_info) = Table [| Obj.magic info |] + + let unknown_state = + dummy { idx = Idx.unknown; final = []; desc = Automata.State.dummy } + + let make ~ncol state = + let st = Table (Array.make (ncol + 1) unknown_state) in + set_info st state; + st + + let make_break state = Table [| Obj.magic state |] + end + + (* Automata (compiled regular expression) *) + type re = { + initial : Automata.expr; + (* The whole regular expression *) + mutable initial_states : (Category.t * State.t) list; + (* Initial states, indexed by initial category *) + colors : Color_map.Table.t; + (* Color table *) + color_repr : Color_map.Repr.t; + (* Table from colors to one character of this color *) + ncolor : int; + (* Number of colors. *) + lnl : Cset.cp; + (* Color of the last newline. [Cset.null_char] if unnecessary *) + tbl : Automata.Working_area.t; + (* Temporary table used to compute the first available index + when computing a new state *) + states : State.t Automata.State.Table.t; + (* States of the deterministic automata *) + group_names : (string * int) list; + (* Named groups in the regular expression *) + group_count : int; + (* Number of groups in the regular expression *) + mutex : Mutex.t; + } + + (* Thread-safety: + We use double-checked locking to access field [initial_states]. The + state table [states] and the working area [tbl] are only accessed + with the mutex [mutex] locked. + The working area is shared between all threads. 
This might be + inefficient if many threads are updating the automaton. It seems + complicated to manage a working area per domain and per regular + expression. So, if this becomes an issue, it might just be simpler + to allocate a fresh working area whenever needed. +*) + + let pp_re ch re = Automata.pp ch re.initial + let group_count re = re.group_count + let group_names re = re.group_names + + module Positions = struct + (* Information used during matching *) + type t = { + mutable positions : int array; + (* Array of mark positions + The mark are off by one for performance reasons *) + mutable length : int; + } + + let empty = { positions = [||]; length = 0 } + let length t = t.length + let unsafe_set t idx pos = Array.unsafe_set t.positions idx pos + + let rec resize idx t = + t.length <- 2 * t.length; + if idx >= t.length then resize idx t + else + let pos = t.positions in + t.positions <- Array.make t.length 0; + Array.blit pos 0 t.positions 0 (Array.length pos) + + let set t idx pos = + if idx >= length t then resize idx t; + unsafe_set t idx pos + + let all t = t.positions + let first t = t.positions.(0) + + let make ~groups re = + if groups then + (* We initialize this table with a reasonable size. The required + size may change when the automaton gets updated. So we are + always checking whether it is large enough before modifying it. 
*) + let length = Automata.Working_area.index_count re.tbl + 1 in + { positions = Array.make length 0; length } + else empty + end + + (****) + + let category re ~color = + if Cset.CodePage.equal color Cset.CodePage.null then Category.inexistant + (* Special category for the last newline *) + else if Cset.CodePage.equal color re.lnl then + Category.(lastnewline ++ newline ++ not_letter) + else + Category.from_letter + @@ Color_map.Repr.repr re.color_repr (Cset.CodePage.to_int color) + + (****) + (* Return the cached state for [desc], building and caching it on a miss. *) + let find_state re desc = + try Automata.State.Table.find re.states desc + with Not_found -> + let st = + let break_state = + match Automata.State.status_no_mutex desc with + | Running -> false + | Failed | Match _ -> true + in + let st = + { + idx = + (let idx = Automata.State.idx desc in + if break_state then Idx.make_break idx else Idx.of_idx idx); + final = []; + desc; + } + in + if break_state then State.make_break st + else State.make ~ncol:re.ncolor st + in + Automata.State.Table.add re.states desc st; + st + (**** Match with marks ****) + + let delta re cat ~color st = + Automata.delta re.tbl cat color st.desc + + let validate re (s : string) ~pos st = + let letter = + Cset.Codec.Unsafe.unsafe_bytes s pos |> Cset.Codec.from_bytes + in + let color = Color_map.Table.get re.colors letter in + Mutex.lock re.mutex; + (if State.is_unknown_transition st ~color then + let st' = + let desc' = + let cat = category re ~color in + delta re cat ~color (State.get_info st) + in + find_state re desc' + in + State.set_transition st ~color st'); + Mutex.unlock re.mutex + + let next colors st s pos = + State.follow_transition st + ~color: + (Cset.Codec.Unsafe.unsafe_bytes s pos + |> Cset.Codec.from_bytes |> Color_map.Table.get colors) + + let rec loop re ~colors ~positions s ~pos ~last st0 st = + if pos < last then + let st' = next colors st s pos in + let idx = (State.get_info st').idx in + if Idx.is_idx idx then + if + Idx.idx idx < Positions.length positions + then ( + Positions.unsafe_set positions (Idx.idx idx) pos; + let
w = Cset.Codec.width String.unsafe_get s pos in + loop re ~colors ~positions s ~pos:(pos + w) ~last st' st') + else ( + Positions.set positions (Idx.idx idx) pos; + let w = Cset.Codec.width String.unsafe_get s pos in + loop re ~colors ~positions s ~pos:(pos + w) ~last st' st') + else if Idx.is_break idx then ( + Positions.set positions (Idx.break_idx idx) pos; + st') + else ( + (* Unknown *) + validate re s ~pos st0; + loop re ~colors ~positions s ~pos ~last st0 st0) + else + st + + let rec loop_no_mark re ~colors s ~pos ~last st0 st = + if pos < last then + let st' = next colors st s pos in + let idx = (State.get_info st').idx in + if Idx.is_idx idx then + let w = Cset.Codec.width String.unsafe_get s pos in + loop_no_mark re ~colors s ~pos:(pos + w) ~last st' st' + else if Idx.is_break idx then + st' + else ( + (* Unknown *) + validate re s ~pos st0; + loop_no_mark re ~colors s ~pos ~last st0 st0) + else st + + let final re st cat = + try List.assq cat st.final + with Not_found -> + Mutex.lock re.mutex; + let res = + try List.assq cat st.final + with Not_found -> + let st' = delta re cat ~color:Cset.CodePage.null st in + let res = + (Automata.State.idx st', Automata.State.status_no_mutex st') + in + st.final <- (cat, res) :: st.final; + res + in + Mutex.unlock re.mutex; + res + + let find_initial_state re cat = + try List.assq cat re.initial_states + with Not_found -> + Mutex.lock re.mutex; + let res = + try List.assq cat re.initial_states + with Not_found -> + let st = find_state re (Automata.State.create cat re.initial) in + re.initial_states <- (cat, st) :: re.initial_states; + st + in + Mutex.unlock re.mutex; + res + + let get_color re (s : string) pos = + if pos < 0 then Cset.CodePage.null + else + let slen = String.length s in + if pos >= slen then Cset.CodePage.null + else if + pos = slen - 1 + && (not (Cset.CodePage.equal re.lnl Cset.CodePage.null)) + && Cset.CodePage.equal + (Cset.Codec.Unsafe.unsafe_bytes_rev s (slen - 1) + |> Cset.Codec.from_bytes |> 
Cset.CodePage.from_letter) + (Cset.Codec.new_line |> Cset.CodePage.from_letter) + then + (* Special case for the last newline *) + re.lnl + else + let letter = + Cset.Codec.Unsafe.unsafe_bytes s pos |> Cset.Codec.from_bytes + in + Color_map.Table.get re.colors letter + + let rec handle_last_newline re positions ~pos st ~groups = + let st' = State.follow_transition st ~color:re.lnl in + let info = State.get_info st' in + if Idx.is_idx info.idx then ( + if groups then Positions.set positions (Idx.idx info.idx) pos; + st') + else if Idx.is_break info.idx then ( + if groups then Positions.set positions (Idx.break_idx info.idx) pos; + st') + else + (* Unknown *) + let color = re.lnl in + Mutex.lock re.mutex; + (if State.is_unknown_transition st ~color then + let st' = + let desc = + let cat = category re ~color in + let real_c = + Color_map.Table.get re.colors Cset.Codec.new_line + (* '\n' *) + in + delta re cat ~color:real_c (State.get_info st) + in + find_state re desc + in + State.set_transition st ~color st'); + Mutex.unlock re.mutex; + handle_last_newline re positions ~pos st ~groups + + let rec scan_str re positions (s : string) initial_state ~last ~pos ~groups = + if + last = String.length s + && (not (Cset.CodePage.equal re.lnl Cset.CodePage.null)) + && last > pos + && + try + Cset.CodePage.equal + (Cset.Codec.Unsafe.unsafe_bytes_rev s (last - 1) + |> Cset.Codec.from_bytes |> Cset.CodePage.from_letter) + (Cset.Codec.new_line |> Cset.CodePage.from_letter) + with _ -> false + then + let w = Cset.Codec.width_rev String.unsafe_get s (last - 1) in + let last = last - w in + let st = scan_str re positions ~pos s initial_state ~last ~groups in + if Idx.is_break (State.get_info st).idx then st + else handle_last_newline re positions ~pos:last st ~groups + else if groups then + loop re ~colors:re.colors ~positions s ~pos ~last initial_state + initial_state + else + loop_no_mark re ~colors:re.colors s ~pos ~last initial_state initial_state + + (* This function adds a 
final boundary check on the input. + This is useful to indicate that the output failed because + of insufficient input, or to verify that the output actually + matches for regex that have boundary conditions with respect + to the input string. +*) + let final_boundary_check re positions ~last ~slen s state_info ~groups = + let idx, res = + let final_cat = + Category.( + search_boundary + ++ + if last = slen then inexistant + else category re ~color:(get_color re s last)) + in + final re state_info final_cat + in + (match (groups, res) with + | true, Match _ -> Positions.set positions (Automata.Idx.to_int idx) last + | _ -> ()); + res + + let make_match_str re positions ~len ~groups ~partial s ~pos = + let slen = String.length s in + let last = if len = -1 then slen else pos + len in + let st = + let initial_state = + let initial_cat = + Category.( + search_boundary + ++ + if pos = 0 then inexistant + else + let w = Cset.Codec.width_rev String.unsafe_get s (pos - 1) in + category re ~color:(get_color re s (pos - w))) + in + let st = find_initial_state re initial_cat in + st + in + scan_str re positions s initial_state ~pos ~last ~groups + in + let state_info = State.get_info st in + if Idx.is_break state_info.idx || (partial && not groups) then + Automata.State.status re.mutex state_info.desc + else if partial && groups then + match Automata.State.status re.mutex state_info.desc with + | (Match _ | Failed) as status -> + status + | Running -> ( + (* This could be because it's still not fully matched, or it + could be that because we need to run special end of input + checks. *) + match + final_boundary_check re positions ~last ~slen s state_info ~groups + with + | Match _ as status -> + status + | Failed | Running -> + (* A failure here just means that we need more data, i.e. + it's a partial match. 
*) + Running) + else + final_boundary_check re positions ~last ~slen s state_info ~groups + + module Stream = struct + type nonrec t = { state : State.t; re : re } + type 'a feed = Ok of 'a | No_match + + let create re = + let category = Category.(search_boundary ++ inexistant) in + let state = find_initial_state re category in + { state; re } + + let feed t s ~pos ~len = + (* TODO bound checks? *) + let last = pos + len in + let state = + loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state + in + let info = State.get_info state in + if + Idx.is_break info.idx + && + match Automata.State.status t.re.mutex info.desc with + | Failed -> true + | Match _ | Running -> false + then No_match + else Ok { t with state } + + let finalize t s ~pos ~len = + (* TODO bound checks? *) + let last = pos + len in + let state = + scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false + in + let info = State.get_info state in + match + let _idx, res = + let final_cat = Category.(search_boundary ++ inexistant) in + final t.re info final_cat + in + res + with + | Running | Failed -> false + | Match _ -> true + + module Group = struct + type nonrec t = { + t : t; + positions : Positions.t; + slices : Slice.L.t; + abs_pos : int; + first_match_pos : int; + } + + let no_match_starts_before t = t.first_match_pos + + let create t = + { + t; + positions = Positions.make ~groups:true t.re; + slices = []; + abs_pos = 0; + first_match_pos = 0; + } + + module Match = struct + type t = { + pmarks : Pmark.Set.t; + slices : Slice.L.t; + marks : Mark_infos.t; + positions : int array; + start_pos : int; + } + + let test_mark t mark = Pmark.Set.mem mark t.pmarks + + let get t i = + Mark_infos.offset t.marks i + |> Option.map (fun (start, stop) -> + let start = t.positions.(start) - t.start_pos in + let stop = t.positions.(stop) - t.start_pos in + Slice.L.get_substring t.slices ~start ~stop) + + let make ~start_pos ~pmarks ~slices ~marks ~positions = + let positions = 
Positions.all positions in + { pmarks; slices; positions; marks; start_pos } + end + + let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st = + if pos < last then + let st' = next colors st s pos in + let idx = (State.get_info st').idx in + if Idx.is_idx idx then + if Idx.idx idx < Positions.length positions then ( + Positions.unsafe_set positions (Idx.idx idx) (abs_pos + pos); + let w = Cset.Codec.width String.unsafe_get s pos in + loop re ~abs_pos ~colors ~positions s ~pos:(pos + w) ~last st' st') + else ( + (* Resize position array *) + Positions.set positions (Idx.idx idx) (abs_pos + pos); + let w = Cset.Codec.width String.unsafe_get s pos in + loop re ~abs_pos ~colors ~positions s ~pos:(pos + w) ~last st' st') + else if Idx.is_break idx then ( + Positions.set positions (Idx.break_idx idx) (abs_pos + pos); + st') + else ( + (* Unknown *) + validate re s ~pos st0; + loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0) + else st + + let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s + ~pos ~len = + let state = + (* TODO bound checks? *) + let last = pos + len in + loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state + t.state + in + let info = State.get_info state in + if + Idx.is_break info.idx + && + match Automata.State.status t.re.mutex info.desc with + | Failed -> true + | Match _ | Running -> false + then No_match + else + let t = { t with state } in + let slices = { Slice.s; pos; len } :: slices in + let first_match_pos = Positions.first positions in + let slices = + Slice.L.drop_rev slices (first_match_pos - tt.first_match_pos) + in + let abs_pos = abs_pos + len in + Ok { tt with t; slices; abs_pos; first_match_pos } + + let finalize + ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos + ~len : Match.t feed = + (* TODO bound checks? 
*) + let last = pos + len in + let info = + let state = + loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos + t.state t.state + in + State.get_info state + in + match + match Automata.State.status t.re.mutex info.desc with + | (Match _ | Failed) as s -> s + | Running -> + let idx, res = + let final_cat = Category.(search_boundary ++ inexistant) in + final t.re info final_cat + in + (match res with + | Running | Failed -> () + | Match _ -> + Positions.set positions (Automata.Idx.to_int idx) (abs_pos + last)); + res + with + | Running | Failed -> No_match + | Match (marks, pmarks) -> + let first_match_position = Positions.first positions in + let slices = + let slices = + let slices = { Slice.s; pos; len } :: slices in + Slice.L.drop_rev slices (first_match_position - tt.first_match_pos) + in + List.rev slices + in + Ok + (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices + ~positions) + end + end + + type match_info = + | Match of Group.t + | Failed + | Running of { no_match_starts_before : int } + + let match_str_no_bounds ~groups ~partial re s ~pos ~len = + let positions = Positions.make ~groups re in + match make_match_str re positions ~len ~groups ~partial s ~pos with + | Match (marks, pmarks) -> + Match + (Group.create s marks pmarks ~gpos:(Positions.all positions) + ~gcount:re.group_count) + | Failed -> Failed + | Running -> + let no_match_starts_before = + if groups then Positions.first positions else 0 + in + Running { no_match_starts_before } + + let match_str_p re s ~pos ~len = + if pos < 0 || len < -1 || pos + len > String.length s then + raise (Invalid_argument "Re.exec: out of bounds"); + match + make_match_str re Positions.empty ~len ~groups:false ~partial:false s ~pos + with + | Match _ -> true + | _ -> false + + let match_str ~groups ~partial re s ~pos ~len = + if pos < 0 || len < -1 || pos + len > String.length s then + invalid_arg "Re.exec: out of bounds"; + match_str_no_bounds ~groups ~partial re s ~pos ~len + + let 
mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_names ~group_count + = + { + initial; + initial_states = []; + colors; + color_repr; + ncolor; + lnl; + tbl = Automata.Working_area.create (); + states = Automata.State.Table.create 97; + group_names; + group_count; + mutex = Mutex.create (); + } + + (**** Compilation ****) + + (* module A = Automata *) + + let enforce_kind ids kind kind' cr = + match (kind, kind') with + | `First, `First -> cr + | `First, k -> Automata.seq ids k cr (Automata.eps ids) + | _ -> cr + + type context = { + ids : Automata.Ids.t; + kind : Automata.Sem.t; + ign_group : bool; + greedy : Automata.Rep_kind.t; + pos : Automata.Mark.t ref; + names : (string * int) list ref; + cache : Cset.t Cset.CSetMap.t ref; + colors : Color_map.Table.t; + } + + let trans_set cache (cm : Color_map.Table.t) s = + match Cset.one_c s with + | Some i -> Cset.csingle (Color_map.Table.get_letter cm i) + | None -> ( + let v = (Cset.hash s, s) in + try Cset.CSetMap.find v !cache + with Not_found -> + let l = Color_map.Table.translate_colors cm s in + cache := Cset.CSetMap.add v l !cache; + l) + + let make_repeater ids cr kind greedy = + match greedy with + | `Greedy -> + fun rem -> + Automata.alt ids + [ + Automata.seq ids kind (Automata.rename ids cr) rem; Automata.eps ids; + ] + | `Non_greedy -> + fun rem -> + Automata.alt ids + [ + Automata.eps ids; Automata.seq ids kind (Automata.rename ids cr) rem; + ] + + (* XXX should probably compute a category mask *) + let rec translate + ({ ids; kind; ign_group; greedy; pos; names; cache; colors } as ctx) + (ast : Ast.no_case) = + match ast with + | Set s -> (Automata.cst ids (trans_set cache colors s), kind) + | Sequence l -> (trans_seq ctx l, kind) + | Ast (Alternative l) -> ( + match Ast.merge_sequences l with + | [ r' ] -> + let cr, kind' = translate ctx r' in + (enforce_kind ids kind kind' cr, kind) + | merged_sequences -> + ( Automata.alt ids + (List.map merged_sequences ~f:(fun r' -> + let cr, kind' = translate 
ctx r' in + enforce_kind ids kind kind' cr)), + kind )) + | Repeat (r', i, j) -> + let cr, kind' = translate ctx r' in + let rem = + match j with + | None -> Automata.rep ids greedy kind' cr + | Some j -> + let f = make_repeater ids cr kind' greedy in + iter (j - i) f (Automata.eps ids) + in + ( iter i + (fun rem -> Automata.seq ids kind' (Automata.rename ids cr) rem) + rem, + kind ) + | Beg_of_line -> (Automata.after ids Category.(inexistant ++ newline), kind) + | End_of_line -> (Automata.before ids Category.(inexistant ++ newline), kind) + | Beg_of_word -> + ( Automata.seq ids `First + (Automata.after ids Category.(inexistant ++ not_letter)) + (Automata.before ids Category.letter), + kind ) + | End_of_word -> + ( Automata.seq ids `First + (Automata.after ids Category.letter) + (Automata.before ids Category.(inexistant ++ not_letter)), + kind ) + | Not_bound -> + ( Automata.alt ids + [ + Automata.seq ids `First + (Automata.after ids Category.letter) + (Automata.before ids Category.letter); + (let cat = Category.(inexistant ++ not_letter) in + Automata.seq ids `First (Automata.after ids cat) + (Automata.before ids cat)); + ], + kind ) + | Beg_of_str -> (Automata.after ids Category.inexistant, kind) + | End_of_str -> (Automata.before ids Category.inexistant, kind) + | Last_end_of_line -> + (Automata.before ids Category.(inexistant ++ lastnewline), kind) + | Start -> (Automata.after ids Category.search_boundary, kind) + | Stop -> (Automata.before ids Category.search_boundary, kind) + | Sem (kind', r') -> + let cr, kind'' = translate { ctx with kind = kind' } r' in + (enforce_kind ids kind' kind'' cr, kind') + | Sem_greedy (greedy', r') -> translate { ctx with greedy = greedy' } r' + | Group (n, r') -> + if ign_group then translate ctx r' + else + let p = !pos in + let () = + match n with + | Some name -> names := (name, Automata.Mark.group_count p) :: !names + | None -> () + in + pos := Automata.Mark.next2 !pos; + let cr, kind' = translate ctx r' in + ( Automata.seq 
ids `First (Automata.mark ids p) + (Automata.seq ids `First cr + (Automata.mark ids (Automata.Mark.next p))), + kind' ) + | No_group r' -> translate { ctx with ign_group = true } r' + | Nest r' -> + let b = !pos in + let cr, kind' = translate ctx r' in + let e = Automata.Mark.prev !pos in + if Automata.Mark.compare e b = -1 then (cr, kind') + else (Automata.seq ids `First (Automata.erase ids b e) cr, kind') + | Pmark (i, r') -> + let cr, kind' = translate ctx r' in + (Automata.seq ids `First (Automata.pmark ids i) cr, kind') + + and trans_seq ({ ids; kind; _ } as ctx) = function + | [] -> Automata.eps ids + | [ r ] -> + let cr', kind' = translate ctx r in + enforce_kind ids kind kind' cr' + | r :: rem -> + let cr', kind' = translate ctx r in + let cr'' = trans_seq ctx rem in + if Automata.is_eps cr'' then cr' + else if Automata.is_eps cr' then cr'' + else Automata.seq ids kind' cr' cr'' + + let compile_1 regexp = + let regexp = Ast.handle_case false regexp in + let color_map = Color_map.make () in + let need_lnl = Ast.colorize color_map regexp in + let colors, color_repr = Color_map.flatten color_map in + let ncolor = Color_map.Repr.length color_repr in + let lnl = + if need_lnl then Cset.CodePage.of_int ncolor else Cset.CodePage.null + in + let ncolor = if need_lnl then ncolor + 1 else ncolor in + let ctx = + { + ids = Automata.Ids.create (); + kind = `First; + ign_group = false; + greedy = `Greedy; + pos = ref Automata.Mark.start; + names = ref []; + cache = ref Cset.CSetMap.empty; + colors; + } + in + let r, kind = translate ctx regexp in + let r = enforce_kind ctx.ids `First kind r in + mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl + ~group_names:(List.rev !(ctx.names)) + ~group_count:(Automata.Mark.group_count !(ctx.pos)) + + let compile r = + let open Ast.Export in + compile_1 + (if Ast.anchored r then group r else seq [ shortest (rep any); group r ]) +end diff --git a/lib/unicode/compile.mli b/lib/unicode/compile.mli new file mode 100644 index 
00000000..ea77fe77 --- /dev/null +++ b/lib/unicode/compile.mli @@ -0,0 +1,68 @@ +module type T = sig + type re + type ast + + module Stream : sig + type t + type 'a feed = Ok of 'a | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + val no_match_starts_before : t -> int + end + end + + type match_info = + | Match of Group.t + | Failed + | Running of { no_match_starts_before : int } + + val match_str_no_bounds : + groups:bool -> + partial:bool -> + re -> + string -> + pos:int -> + len:int -> + match_info + + val match_str : + groups:bool -> + partial:bool -> + re -> + string -> + pos:int -> + len:int -> + match_info + + val match_str_p : re -> string -> pos:int -> len:int -> bool + val compile : ast -> re + val group_count : re -> int + val group_names : re -> (string * int) list + val pp_re : re Fmt.t +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T with type ast = Ast.Make(Cset)(Color_map).t diff --git a/lib/unicode/core.ml b/lib/unicode/core.ml new file mode 100644 index 00000000..9283e050 --- /dev/null +++ b/lib/unicode/core.ml @@ -0,0 +1,989 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later 
version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +open Import + +module type T = sig + type letter + + (** Regular expression *) + type t + + (** Compiled regular expression *) + type re + + (** Manipulate matching groups. *) + module Group : sig + (** Information about groups in a match. As is conventional, every match + implicitly has a group 0 that covers the whole match, and explicit + groups are numbered from 1. *) + type t = Group.t + + (** Raise [Not_found] if the group did not match *) + val get : t -> int -> string + + (** Similar to {!get}, but returns an option instead of using an exception. + *) + val get_opt : t -> int -> string option + + (** Raise [Not_found] if the group did not match *) + val offset : t -> int -> int * int + + (** Similar to {!offset}, but returns an option instead of using an + exception. *) + val offset_opt : t -> int -> (int * int) option + + (** Return the start of the match. Raise [Not_found] if the group did not + match. *) + val start : t -> int -> int + + (** Similar to {!start}, but returns an option instead of using an + exception. *) + val start_opt : t -> int -> int option + + (** Return the end of the match. Raise [Not_found] if the group did not + match. *) + val stop : t -> int -> int + + (** Similar to {!stop}, but returns an option instead of using an + exception. 
*) + val stop_opt : t -> int -> int option + + (** Return the empty string for each group which did not match *) + val all : t -> string array + + (** Return [(-1,-1)] for each group which did not match *) + val all_offset : t -> (int * int) array + + (** Test whether a group matched *) + val test : t -> int -> bool + + (** Returns the total number of groups defined - matched or not. This + function is experimental. *) + val nb_groups : t -> int + + val pp : Format.formatter -> t -> unit + end + + type groups = Group.t [@@ocaml.deprecated "Use Group.t"] + + (** {2 Compilation and execution of a regular expression} *) + + (** Compile a regular expression into an executable version that can be used + to match strings, e.g. with {!exec}. *) + val compile : t -> re + + (** Return the number of capture groups (including the one corresponding to + the entire regexp). *) + val group_count : re -> int + + (** Return named capture groups with their index. *) + val group_names : re -> (string * int) list + + (** [exec re str] searches [str] for a match of the compiled expression [re], + and returns the matched groups if any. + + More specifically, when a match exists, [exec] returns a match that starts + at the earliest position possible. If multiple such matches are possible, + the one specified by the match semantics described below is returned. 
+ + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.exec regex "// a C comment";; + - : Re.Group.t = + + # Re.exec regex "# a C comment?";; + Exception: Not_found + + # Re.exec ~pos:1 regex "// a C comment";; + Exception: Not_found + ]} + + @param pos optional beginning of the string (default 0) + @param len + length of the substring of [str] that can be matched (default [-1], + meaning to the end of the string) + @raise Not_found if the regular expression can't be found in [str] *) + val exec : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + Group.t + + (** Similar to {!exec}, but returns an option instead of using an exception. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.exec_opt regex "// a C comment";; + - : Re.Group.t option = Some + + # Re.exec_opt regex "# a C comment?";; + - : Re.Group.t option = None + + # Re.exec_opt ~pos:1 regex "// a C comment";; + - : Re.Group.t option = None + ]} *) + val exec_opt : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + Group.t option + + (** Similar to {!exec}, but returns [true] if the expression matches, and + [false] if it doesn't. This function is more efficient than calling + {!exec} or {!exec_opt} and ignoring the returned group. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.execp regex "// a C comment";; + - : bool = true + + # Re.execp ~pos:1 regex "// a C comment";; + - : bool = false + ]} *) + val execp : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + bool + + (** More detailed version of {!execp}. 
[`Full] is equivalent to [true], while + [`Mismatch] and [`Partial] are equivalent to [false], but [`Partial] + indicates the input string could be extended to create a match. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [bos; str "// a C comment"]);; + val regex : re = + + # Re.exec_partial regex "// a C comment here.";; + - : [ `Full | `Mismatch | `Partial ] = `Full + + # Re.exec_partial regex "// a C comment";; + - : [ `Full | `Mismatch | `Partial ] = `Partial + + # Re.exec_partial regex "//";; + - : [ `Full | `Mismatch | `Partial ] = `Partial + + # Re.exec_partial regex "# a C comment?";; + - : [ `Full | `Mismatch | `Partial ] = `Mismatch + ]} *) + val exec_partial : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + [ `Full | `Partial | `Mismatch ] + + (** More detailed version of {!exec_opt}. [`Full group] is equivalent to + [Some group], while [`Mismatch] and [`Partial _] are equivalent to [None], + but [`Partial position] indicates that the input string could be extended + to create a match, and no match could start in the input string before the + given position. This could be used to not have to search the entirety of + the input if more becomes available, and use the given position as the + [?pos] argument. *) + val exec_partial_detailed : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + [ `Full of Group.t | `Partial of int | `Mismatch ] + + (** Marks *) + module Mark : sig + (** Mark id *) + type t = Pmark.t + + (** Tell if a mark was matched. *) + val test : Group.t -> t -> bool + + module Set : Set.S with type elt = t + + (** Return all the mark matched. 
*) + val all : Group.t -> Set.t + + val equal : t -> t -> bool + val compare : t -> t -> int + end + + (** {2 High Level Operations} *) + + type split_token = + [ `Text of string (** Text between delimiters *) + | `Delim of Group.t (** Delimiter *) ] + + (** Repeatedly calls {!exec} on the given string, starting at given position + and length. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.Group.t list = [; ; ; ] + + # Re.all regex "My head, My shoulders, My knees, My toes ...";; + - : Re.Group.t list = [] + ]} *) + val all : ?pos:int -> ?len:int -> re -> string -> Group.t list + + type 'a gen = unit -> 'a option + + (** @deprecated Use {!module-Seq.all} instead. *) + val all_gen : ?pos:int -> ?len:int -> re -> string -> Group.t gen + [@@ocaml.deprecated "Use Seq.all"] + + (** @deprecated Use {!module-Seq.all} instead. *) + val all_seq : ?pos:int -> ?len:int -> re -> string -> Group.t Seq.t + [@@ocaml.deprecated "Use Seq.all"] + + (** Same as {!all}, but extracts the matched substring rather than returning + the whole group. This basically iterates over matched strings. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string list = ["my head"; "my shoulders"; "my knees"; "my toes"] + + # Re.matches regex "My head, My shoulders, My knees, My toes ...";; + - : string list = [] + + # Re.matches regex "my my my my head my 1 toe my ...";; + - : string list = ["my my"; "my my"] + + # Re.matches ~pos:2 regex "my my my my head my +1 toe my ...";; + - : string list = ["my my"; "my head"] + ]} *) + val matches : ?pos:int -> ?len:int -> re -> string -> string list + + (** @deprecated Use {!module-Seq.matches} instead. 
*) + val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen + [@@ocaml.deprecated "Use Seq.matches"] + + (** @deprecated Use {!module-Seq.matches} instead. *) + val matches_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t + [@@ocaml.deprecated "Use Seq.matches"] + + (** [split re s] splits [s] into chunks separated by [re]. It yields the + chunks themselves, not the separator. An occurrence of the separator at the + beginning or the end of the string is ignored. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split regex "Re,Ocaml,Jerome Vouillon";; + - : string list = ["Re"; "Ocaml"; "Jerome Vouillon"] + + # Re.split regex "No commas in this sentence.";; + - : string list = ["No commas in this sentence."] + + # Re.split regex ",1,2,";; + - : string list = ["1"; "2"] + + # Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : string list = ["3"; "4. Commas go brrr."] + ]} + + {6 Zero-length patterns:} + + Be careful when using [split] with zero-length patterns like [eol], [bow], + and [eow]. Because they don't have any width, they will still be present + in the result. (Note the position of the [\n] and space characters in the + output.) + + {[ + # Re.split (Re.compile Re.eol) "a\nb";; + - : string list = ["a"; "\nb"] + + # Re.split (Re.compile Re.bow) "a b";; + - : string list = ["a "; "b"] + + # Re.split (Re.compile Re.eow) "a b";; + - : string list = ["a"; " b"] + ]} + + Compare this to the behavior of splitting on the char itself. (Note that + the delimiters are not present in the output.) + + {[ + # Re.split (Re.compile (Re.char '\n')) "a\nb";; + - : string list = ["a"; "b"] + + # Re.split (Re.compile (Re.char ' ')) "a b";; + - : string list = ["a"; "b"] + ]} *) + val split : ?pos:int -> ?len:int -> re -> string -> string list + + (** [split_delim re s] splits [s] into chunks separated by [re]. It yields the + chunks themselves, not the separator. 
Occurrences of the separator at the + beginning or the end of the string will produce empty chunks. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split_delim regex "Re,Ocaml,Jerome Vouillon";; + - : string list = ["Re"; "Ocaml"; "Jerome Vouillon"] + + # Re.split_delim regex "No commas in this sentence.";; + - : string list = ["No commas in this sentence."] + + # Re.split_delim regex ",1,2,";; + - : string list = [""; "1"; "2"; ""] + + # Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : string list = ["3"; "4. Commas go brrr."] + ]} + + {6 Zero-length patterns:} + + Be careful when using [split_delim] with zero-length patterns like [eol], + [bow], and [eow]. Because they don't have any width, they will still be + present in the result. (Note the position of the [\n] and space characters + in the output.) + + {[ + # Re.split_delim (Re.compile Re.eol) "a\nb";; + - : string list = ["a"; "\nb"; ""] + + # Re.split_delim (Re.compile Re.bow) "a b";; + - : string list = [""; "a "; "b"] + + # Re.split_delim (Re.compile Re.eow) "a b";; + - : string list = ["a"; " b"; ""] + ]} + + Compare this to the behavior of splitting on the char itself. (Note that + the delimiters are not present in the output.) + + {[ + # Re.split_delim (Re.compile (Re.char '\n')) "a\nb";; + - : string list = ["a"; "b"] + + # Re.split_delim (Re.compile (Re.char ' ')) "a b";; + - : string list = ["a"; "b"] + ]} *) + val split_delim : ?pos:int -> ?len:int -> re -> string -> string list + + (** @deprecated Use {!module-Seq.split} instead. *) + val split_gen : ?pos:int -> ?len:int -> re -> string -> string gen + [@@ocaml.deprecated "Use Seq.split"] + + (** @deprecated Use {!module-Seq.split} instead. *) + val split_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t + [@@ocaml.deprecated "Use Seq.split"] + + (** [split_full re s] splits [s] into chunks separated by [re]. It yields the + chunks along with the separators. 
For instance this can be used with a + whitespace-matching re such as ["[\t ]+"]. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split_full regex "Re,Ocaml,Jerome Vouillon";; + - : Re.split_token list = + [`Text "Re"; `Delim ; `Text "Ocaml"; `Delim ; + `Text "Jerome Vouillon"] + + # Re.split_full regex "No commas in this sentence.";; + - : Re.split_token list = [`Text "No commas in this sentence."] + + # Re.split_full ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : Re.split_token list = + [`Delim ; `Text "3"; `Delim ; `Text "4. Commas go brrr."] + ]} *) + val split_full : ?pos:int -> ?len:int -> re -> string -> split_token list + + (** @deprecated Use {!module-Seq.split_full} instead. *) + val split_full_gen : ?pos:int -> ?len:int -> re -> string -> split_token gen + [@@ocaml.deprecated "Use Seq.split_full"] + + (** @deprecated Use {!module-Seq.split_full} instead. *) + val split_full_seq : ?pos:int -> ?len:int -> re -> string -> split_token Seq.t + [@@ocaml.deprecated "Use Seq.split_full"] + + module Seq : sig + (** Same as {!module-Re.val-all} but returns an iterator. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.Seq.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.Group.t Seq.t = + ]} + @since 1.10.0 *) + val all : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> Group.t Seq.t + + (** Same as {!module-Re.val-matches}, but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.Seq.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string Seq.t = + ]} + @since 1.10.0 *) + val matches : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split} but returns an iterator. 
+ + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = + ]} + @since 1.10.0 *) + val split : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split_delim} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split_delim regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = + ]} + @since 1.11.1 *) + val split_delim : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split_full} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split_full regex "Re,Ocaml,Jerome Vouillon";; + - : Re.split_token Seq.t = + ]} + @since 1.10.0 *) + val split_full : + ?pos:int (** Default: 0 *) -> + ?len:int -> + re -> + string -> + split_token Seq.t + end + + (** {2 String expressions (literal match)} *) + + val str : string -> t + val letter : letter -> t + val char : char -> letter + val int : int -> letter + + (** {2 Basic operations on regular expressions} *) + + (** Alternative. + + [alt []] is equivalent to {!empty}. + + By default, the leftmost match is preferred (see match semantics below). + *) + val alt : t list -> t + + (** Sequence *) + val seq : t list -> t + + (** Match nothing *) + val empty : t + + (** Empty word *) + val epsilon : t + + (** 0 or more matches *) + val rep : t -> t + + (** 1 or more matches *) + val rep1 : t -> t + + (** [repn re i j] matches [re] at least [i] times and at most [j] times, + bounds included. [j = None] means no upper bound. *) + val repn : t -> int -> int option -> t + + (** 0 or 1 matches *) + val opt : t -> t + + (** {2 String, line, word} + + We define a word as a sequence of latin1 letters, digits and underscore. 
+ *) + + (** Beginning of line *) + val bol : t + + (** End of line *) + val eol : t + + (** Beginning of word *) + val bow : t + + (** End of word *) + val eow : t + + (** Beginning of string. This differs from {!start} because it matches the + beginning of the input string even when using [~pos] arguments: + + {[ + let b = execp (compile (seq [ bos; str "a" ])) "aa" ~pos:1 in + assert (not b) + ]} *) + val bos : t + + (** End of string. This is different from {!stop} in the way described in + {!bos}. *) + val eos : t + + (** Last end of line or end of string *) + val leol : t + + (** Initial position. This differs from {!bos} because it takes into account + the [~pos] arguments: + + {[ + let b = execp (compile (seq [ start; str "a" ])) "aa" ~pos:1 in + assert b + ]} *) + val start : t + + (** Final position. This is different from {!eos} in the way described in + {!start}. *) + val stop : t + + (** Word *) + val word : t -> t + + (** Not at a word boundary *) + val not_boundary : t + + (** Only matches the whole string, i.e. [fun t -> seq [ bos; t; eos ]]. *) + val whole_string : t -> t + + (** {2 Match semantics} + + A regular expression frequently matches a string in multiple ways. For + instance [exec (compile (opt (str "a"))) "ab"] can match "" or "a". Match + semantic can be modified with the functions below, allowing one to choose + which of these is preferable. + + By default, the leftmost branch of alternations is preferred, and + repetitions are greedy. + + Note that the existence of matches cannot be changed by specifying match + semantics. [seq [ bos; str "a"; non_greedy (opt (str "b")); eos ]] will + match when applied to "ab". However if + [seq [ bos; str "a"; non_greedy (opt (str "b")) ]] is applied to "ab", it + will match "a" rather than "ab". + + Also note that multiple match semantics can conflict. In this case, the + one executed earlier takes precedence. 
For instance, any match of + [shortest (seq [ bos; group (rep (str "a")); group (rep (str "a")); eos + ])] will always have an empty first group. Conversely, if we use + [longest] instead of [shortest], the second group will always be empty. *) + + (** Longest match semantics. That is, matches will match as many bytes as + possible. If multiple choices match the maximum amount of bytes, the one + respecting the inner match semantics is preferred. *) + val longest : t -> t + + (** Same as {!longest}, but matching the least number of bytes. *) + val shortest : t -> t + + (** First match semantics for alternations (not repetitions). That is, matches + will prefer the leftmost branch of the alternation that matches the text. + *) + val first : t -> t + + (** Greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they + will match as many times as possible. *) + val greedy : t -> t + + (** Non-greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): + they will match as few times as possible. *) + val non_greedy : t -> t + + (** {2 Groups (or submatches)} *) + + (** Delimit a group. The group is considered as matching if it is used at + least once (it may be used multiple times if it is nested inside {!rep} for + instance). If it is used multiple times, the last match is what gets + captured. *) + val group : ?name:string -> t -> t + + (** Remove all groups *) + val no_group : t -> t + + (** When matching against [nest e], only the group matching in the last match + of [e] will be considered as matching. + + For instance: + {[ + let re = compile (rep1 (nest (alt [ group (str "a"); str "b" ]))) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = None); + (* same thing but without [nest] *) + let re = compile (rep1 (alt [ group (str "a"); str "b" ])) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = Some "a") + ]} *) + val nest : t -> t + + (** Mark a regexp. 
the markid can then be used to know if this regexp was + used. *) + val mark : t -> Mark.t * t + + (** {2 Character sets} *) + + (** Any character of the string *) + val set : string -> t + + (** Character ranges *) + val rg : letter -> letter -> t + + (** Intersection of character sets *) + val inter : t list -> t + + (** Difference of character sets *) + val diff : t -> t -> t + + (** Complement of union *) + val compl : t list -> t + + (** {2 Predefined character sets} *) + + (** Any character *) + val any : t + + (** Any character but a newline *) + val notnl : t + + val alnum : t + val wordc : t + val alpha : t + val ascii : t + val blank : t + val cntrl : t + val digit : t + val graph : t + val lower : t + val print : t + val punct : t + val space : t + val upper : t + val xdigit : t + + (** {2 Case modifiers} *) + + (** Case sensitive matching. Note that this works on latin1, not ascii and not + utf8. *) + val case : t -> t + + (** Case insensitive matching. Note that this works on latin1, not ascii and + not utf8. *) + val no_case : t -> t + + (****) + + (** {2 Internal debugging} *) + + val pp : Format.formatter -> t -> unit + val pp_re : Format.formatter -> re -> unit + + (** Alias for {!pp_re}. Deprecated *) + val print_re : Format.formatter -> re -> unit + + (** {2 Experimental functions} *) + + (** [witness r] generates a string [s] such that [execp (compile r) s] is + true. + + Be warned that this function is buggy because it ignores zero-width + assertions like beginning of words. As a result it can generate incorrect + results. *) + val witness : t -> string + + (** {2 Deprecated functions} *) + + (** Alias for {!Group.t}. Deprecated *) + type substrings = Group.t [@@ocaml.deprecated "Use Group.t"] + + (** Same as {!Group.get}. Deprecated *) + val get : Group.t -> int -> string + [@@ocaml.deprecated "Use Group.get"] + + (** Same as {!Group.offset}. 
Deprecated *) + val get_ofs : Group.t -> int -> int * int + [@@ocaml.deprecated "Use Group.offset"] + + (** Same as {!Group.all}. Deprecated *) + val get_all : Group.t -> string array + [@@ocaml.deprecated "Use Group.all"] + + (** Same as {!Group.all_offset}. Deprecated *) + val get_all_ofs : Group.t -> (int * int) array + [@@ocaml.deprecated "Use Group.all_offset"] + + (** Same as {!Group.test}. Deprecated *) + val test : Group.t -> int -> bool + [@@ocaml.deprecated "Use Group.test"] + + (** Alias for {!Mark.t}. Deprecated *) + type markid = Mark.t [@@ocaml.deprecated "Use Mark."] + + (** Same as {!Mark.test}. Deprecated *) + val marked : Group.t -> Mark.t -> bool + [@@ocaml.deprecated "Use Mark.test"] + + (** Same as {!Mark.all}. Deprecated *) + val mark_set : Group.t -> Mark.Set.t + [@@ocaml.deprecated "Use Mark.all"] + + module Stream : sig + (** An experimental API for matching a regular expression by feeding individual + string chunks. + + This module is not covered by semver's stability guarantee. *) + + type t + type 'a feed = Ok of 'a | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + + (** [finalize t s ~pos ~len] feeds [s] from [pos] to [len] and returns whether + the regular expression matched. 
*) + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + (** Match a string against a regular expression with capture groups *) + + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + end + end +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Ast = Ast.Make (Cset) (Color_map) + module Compile = Compile.Make (Cset) (Color_map) + module Search = Search.Make (Cset) (Color_map) + + include struct + let cset = Ast.cset + let rg c c' = cset (Cset.cseq c c') + let notnl = cset Cset.notnl + let lower = cset Cset.lower + let upper = cset Cset.upper + let alpha = cset Cset.alpha + let digit = cset Cset.cdigit + let alnum = cset Cset.alnum + let wordc = cset Cset.wordc + let ascii = cset Cset.ascii + let blank = cset Cset.blank + let cntrl = cset Cset.cntrl + let graph = cset Cset.graph + let print = cset Cset.print + let punct = cset Cset.punct + let space = cset Cset.space + let xdigit = cset Cset.xdigit + end + + include Ast.Export + + let exec_internal ?(pos = 0) ?(len = -1) ~partial ~groups re s = + Compile.match_str ~groups ~partial re s ~pos ~len + + let exec ?pos ?len re s = + match exec_internal ?pos ?len ~groups:true ~partial:false re s with + | Match substr -> substr + | _ -> raise Not_found + + let exec_opt ?pos ?len re s = + match exec_internal ?pos ?len ~groups:true ~partial:false re s with + | Match substr -> Some substr + | _ -> None + + let execp ?(pos = 0) ?(len = -1) re s = Compile.match_str_p ~pos ~len re s + + let exec_partial ?pos ?len re s = + match exec_internal ~groups:false ~partial:true ?pos ?len re s with + | Match _ -> `Full + | Running _ -> `Partial + | 
Failed -> `Mismatch + + let exec_partial_detailed ?pos ?len re s = + match exec_internal ~groups:true ~partial:true ?pos ?len re s with + | Match group -> `Full group + | Running { no_match_starts_before } -> `Partial no_match_starts_before + | Failed -> `Mismatch + + module Mark = struct + type t = Pmark.t + + let test (g : Group.t) p = Pmark.Set.mem p (Group.pmarks g) + let all (g : Group.t) = Group.pmarks g + + module Set = Pmark.Set + + let equal = Pmark.equal + let compare = Pmark.compare + end + + type split_token = [ `Text of string | `Delim of Group.t ] + + module Gen = struct + type 'a gen = unit -> 'a option + + let gen_of_seq (s : 'a Seq.t) : 'a gen = + let r = ref s in + fun () -> + match !r () with + | Seq.Nil -> None + | Seq.Cons (x, tl) -> + r := tl; + Some x + + let split ?pos ?len re s : _ gen = Search.split ?pos ?len re s |> gen_of_seq + + let split_full ?pos ?len re s : _ gen = + Search.split_full ?pos ?len re s |> gen_of_seq + + let all ?pos ?len re s = Search.all ?pos ?len re s |> gen_of_seq + let matches ?pos ?len re s = Search.matches ?pos ?len re s |> gen_of_seq + end + + module Group = Group + + (** {2 Deprecated functions} *) + + let split_full_seq = Search.split_full + let split_seq = Search.split + let matches_seq = Search.matches + let all_seq = Search.all + + type 'a gen = 'a Gen.gen + + let all_gen = Gen.all + let matches_gen = Gen.matches + let split_gen = Gen.split + let split_full_gen = Gen.split_full + + type substrings = Group.t + + let get = Group.get + let get_ofs = Group.offset + let get_all = Group.all + let get_all_ofs = Group.all_offset + let test = Group.test + + type markid = Mark.t + + let marked = Mark.test + let mark_set = Mark.all + + type groups = Group.t + + module List = struct + let list_of_seq (s : 'a Seq.t) : 'a list = + Seq.fold_left (fun l x -> x :: l) [] s |> List.rev + + let all ?pos ?len re s = Search.all ?pos ?len re s |> list_of_seq + let matches ?pos ?len re s = Search.matches ?pos ?len re s |> 
list_of_seq + + let split_full ?pos ?len re s = + Search.split_full ?pos ?len re s |> list_of_seq + + let split ?pos ?len re s = Search.split ?pos ?len re s |> list_of_seq + + let split_delim ?pos ?len re s = + Search.split_delim ?pos ?len re s |> list_of_seq + end + + include List + + include struct + open Compile + + type nonrec re = re + + let compile = compile + let pp_re = pp_re + let print_re = pp_re + let group_names = group_names + let group_count = group_count + end + + module Seq = Search + module Stream = Compile.Stream +end diff --git a/lib/unicode/core.mli b/lib/unicode/core.mli new file mode 100644 index 00000000..3f4e438c --- /dev/null +++ b/lib/unicode/core.mli @@ -0,0 +1,840 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Module [Core]: code for creating and using regular expressions, independently + of regular expression syntax. *) +module type T = sig + type letter + + (** Regular expression *) + type t + + (** Compiled regular expression *) + type re + + (** Manipulate matching groups. *) + module Group : sig + (** Information about groups in a match. 
As is conventional, every match + implicitly has a group 0 that covers the whole match, and explicit + groups are numbered from 1. *) + type t = Group.t + + (** Raise [Not_found] if the group did not match *) + val get : t -> int -> string + + (** Similar to {!get}, but returns an option instead of using an exception. + *) + val get_opt : t -> int -> string option + + (** Raise [Not_found] if the group did not match *) + val offset : t -> int -> int * int + + (** Similar to {!offset}, but returns an option instead of using an + exception. *) + val offset_opt : t -> int -> (int * int) option + + (** Return the start of the match. Raise [Not_found] if the group did not + match. *) + val start : t -> int -> int + + (** Similar to {!start}, but returns an option instead of using an + exception. *) + val start_opt : t -> int -> int option + + (** Return the end of the match. Raise [Not_found] if the group did not + match. *) + val stop : t -> int -> int + + (** Similar to {!stop}, but returns an option instead of using an + exception. *) + val stop_opt : t -> int -> int option + + (** Return the empty string for each group which did not match *) + val all : t -> string array + + (** Return [(-1,-1)] for each group which did not match *) + val all_offset : t -> (int * int) array + + (** Test whether a group matched *) + val test : t -> int -> bool + + (** Returns the total number of groups defined - matched or not. This + function is experimental. *) + val nb_groups : t -> int + + val pp : Format.formatter -> t -> unit + end + + type groups = Group.t [@@ocaml.deprecated "Use Group.t"] + + (** {2 Compilation and execution of a regular expression} *) + + (** Compile a regular expression into an executable version that can be used + to match strings, e.g. with {!exec}. *) + val compile : t -> re + + (** Return the number of capture groups (including the one corresponding to + the entire regexp). 
*) + val group_count : re -> int + + (** Return named capture groups with their index. *) + val group_names : re -> (string * int) list + + (** [exec re str] searches [str] for a match of the compiled expression [re], + and returns the matched groups if any. + + More specifically, when a match exists, [exec] returns a match that starts + at the earliest position possible. If multiple such matches are possible, + the one specified by the match semantics described below is returned. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.exec regex "// a C comment";; + - : Re.Group.t = + + # Re.exec regex "# a C comment?";; + Exception: Not_found + + # Re.exec ~pos:1 regex "// a C comment";; + Exception: Not_found + ]} + + @param pos optional beginning of the string (default 0) + @param len + length of the substring of [str] that can be matched (default [-1], + meaning to the end of the string) + @raise Not_found if the regular expression can't be found in [str] *) + val exec : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + Group.t + + (** Similar to {!exec}, but returns an option instead of using an exception. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.exec_opt regex "// a C comment";; + - : Re.Group.t option = Some + + # Re.exec_opt regex "# a C comment?";; + - : Re.Group.t option = None + + # Re.exec_opt ~pos:1 regex "// a C comment";; + - : Re.Group.t option = None + ]} *) + val exec_opt : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + Group.t option + + (** Similar to {!exec}, but returns [true] if the expression matches, and + [false] if it doesn't. This function is more efficient than calling + {!exec} or {!exec_opt} and ignoring the returned group. 
+ + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "//"; rep print ]);; + val regex : re = + + # Re.execp regex "// a C comment";; + - : bool = true + + # Re.execp ~pos:1 regex "// a C comment";; + - : bool = false + ]} *) + val execp : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + bool + + (** More detailed version of {!execp}. [`Full] is equivalent to [true], while + [`Mismatch] and [`Partial] are equivalent to [false], but [`Partial] + indicates the input string could be extended to create a match. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [bos; str "// a C comment"]);; + val regex : re = + + # Re.exec_partial regex "// a C comment here.";; + - : [ `Full | `Mismatch | `Partial ] = `Full + + # Re.exec_partial regex "// a C comment";; + - : [ `Full | `Mismatch | `Partial ] = `Partial + + # Re.exec_partial regex "//";; + - : [ `Full | `Mismatch | `Partial ] = `Partial + + # Re.exec_partial regex "# a C comment?";; + - : [ `Full | `Mismatch | `Partial ] = `Mismatch + ]} *) + val exec_partial : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + [ `Full | `Partial | `Mismatch ] + + (** More detailed version of {!exec_opt}. [`Full group] is equivalent to + [Some group], while [`Mismatch] and [`Partial _] are equivalent to [None], + but [`Partial position] indicates that the input string could be extended + to create a match, and no match could start in the input string before the + given position. This could be used to not have to search the entirety of + the input if more becomes available, and use the given position as the + [?pos] argument. 
*) + val exec_partial_detailed : + ?pos:int (** Default: 0 *) -> + ?len:int (** Default: -1 (until end of string) *) -> + re -> + string -> + [ `Full of Group.t | `Partial of int | `Mismatch ] + + (** Marks *) + module Mark : sig + (** Mark id *) + type t = Pmark.t + + (** Tell if a mark was matched. *) + val test : Group.t -> t -> bool + + module Set : Set.S with type elt = t + + (** Return all the mark matched. *) + val all : Group.t -> Set.t + + val equal : t -> t -> bool + val compare : t -> t -> int + end + + (** {2 High Level Operations} *) + + type split_token = + [ `Text of string (** Text between delimiters *) + | `Delim of Group.t (** Delimiter *) ] + + (** Repeatedly calls {!exec} on the given string, starting at given position + and length. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.Group.t list = [; ; ; ] + + # Re.all regex "My head, My shoulders, My knees, My toes ...";; + - : Re.Group.t list = [] + ]} *) + val all : ?pos:int -> ?len:int -> re -> string -> Group.t list + + type 'a gen = unit -> 'a option + + (** @deprecated Use {!module-Seq.all} instead. *) + val all_gen : ?pos:int -> ?len:int -> re -> string -> Group.t gen + [@@ocaml.deprecated "Use Seq.all"] + + (** @deprecated Use {!module-Seq.all} instead. *) + val all_seq : ?pos:int -> ?len:int -> re -> string -> Group.t Seq.t + [@@ocaml.deprecated "Use Seq.all"] + + (** Same as {!all}, but extracts the matched substring rather than returning + the whole group. This basically iterates over matched strings. 
+ + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string list = ["my head"; "my shoulders"; "my knees"; "my toes"] + + # Re.matches regex "My head, My shoulders, My knees, My toes ...";; + - : string list = [] + + # Re.matches regex "my my my my head my 1 toe my ...";; + - : string list = ["my my"; "my my"] + + # Re.matches ~pos:2 regex "my my my my head my +1 toe my ...";; + - : string list = ["my my"; "my head"] + ]} *) + val matches : ?pos:int -> ?len:int -> re -> string -> string list + + (** @deprecated Use {!module-Seq.matches} instead. *) + val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen + [@@ocaml.deprecated "Use Seq.matches"] + + (** @deprecated Use {!module-Seq.matches} instead. *) + val matches_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t + [@@ocaml.deprecated "Use Seq.matches"] + + (** [split re s] splits [s] into chunks separated by [re]. It yields the + chunks themselves, not the separator. An occurrence of the separator at the + beginning or the end of the string is ignored. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split regex "Re,Ocaml,Jerome Vouillon";; + - : string list = ["Re"; "Ocaml"; "Jerome Vouillon"] + + # Re.split regex "No commas in this sentence.";; + - : string list = ["No commas in this sentence."] + + # Re.split regex ",1,2,";; + - : string list = ["1"; "2"] + + # Re.split ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : string list = ["3"; "4. Commas go brrr."] + ]} + + {6 Zero-length patterns:} + + Be careful when using [split] with zero-length patterns like [eol], [bow], + and [eow]. Because they don't have any width, they will still be present + in the result. (Note the position of the [\n] and space characters in the + output.) 
+ + {[ + # Re.split (Re.compile Re.eol) "a\nb";; + - : string list = ["a"; "\nb"] + + # Re.split (Re.compile Re.bow) "a b";; + - : string list = ["a "; "b"] + + # Re.split (Re.compile Re.eow) "a b";; + - : string list = ["a"; " b"] + ]} + + Compare this to the behavior of splitting on the char itself. (Note that + the delimiters are not present in the output.) + + {[ + # Re.split (Re.compile (Re.char '\n')) "a\nb";; + - : string list = ["a"; "b"] + + # Re.split (Re.compile (Re.char ' ')) "a b";; + - : string list = ["a"; "b"] + ]} *) + val split : ?pos:int -> ?len:int -> re -> string -> string list + + (** [split_delim re s] splits [s] into chunks separated by [re]. It yields the + chunks themselves, not the separator. Occurrences of the separator at the + beginning or the end of the string will produce empty chunks. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split_delim regex "Re,Ocaml,Jerome Vouillon";; + - : string list = ["Re"; "Ocaml"; "Jerome Vouillon"] + + # Re.split_delim regex "No commas in this sentence.";; + - : string list = ["No commas in this sentence."] + + # Re.split_delim regex ",1,2,";; + - : string list = [""; "1"; "2"; ""] + + # Re.split_delim ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : string list = ["3"; "4. Commas go brrr."] + ]} + + {6 Zero-length patterns:} + + Be careful when using [split_delim] with zero-length patterns like [eol], + [bow], and [eow]. Because they don't have any width, they will still be + present in the result. (Note the position of the [\n] and space characters + in the output.) + + {[ + # Re.split_delim (Re.compile Re.eol) "a\nb";; + - : string list = ["a"; "\nb"; ""] + + # Re.split_delim (Re.compile Re.bow) "a b";; + - : string list = [""; "a "; "b"] + + # Re.split_delim (Re.compile Re.eow) "a b";; + - : string list = ["a"; " b"; ""] + ]} + + Compare this to the behavior of splitting on the char itself. (Note that + the delimiters are not present in the output.) 
+ + {[ + # Re.split_delim (Re.compile (Re.char '\n')) "a\nb";; + - : string list = ["a"; "b"] + + # Re.split_delim (Re.compile (Re.char ' ')) "a b";; + - : string list = ["a"; "b"] + ]} *) + val split_delim : ?pos:int -> ?len:int -> re -> string -> string list + + (** @deprecated Use {!module-Seq.split} instead. *) + val split_gen : ?pos:int -> ?len:int -> re -> string -> string gen + [@@ocaml.deprecated "Use Seq.split"] + + (** @deprecated Use {!module-Seq.split} instead. *) + val split_seq : ?pos:int -> ?len:int -> re -> string -> string Seq.t + [@@ocaml.deprecated "Use Seq.split"] + + (** [split_full re s] splits [s] into chunks separated by [re]. It yields the + chunks along with the separators. For instance this can be used with a + whitespace-matching re such as ["[\t ]+"]. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.split_full regex "Re,Ocaml,Jerome Vouillon";; + - : Re.split_token list = + [`Text "Re"; `Delim ; `Text "Ocaml"; `Delim ; + `Text "Jerome Vouillon"] + + # Re.split_full regex "No commas in this sentence.";; + - : Re.split_token list = [`Text "No commas in this sentence."] + + # Re.split_full ~pos:3 regex "1,2,3,4. Commas go brrr.";; + - : Re.split_token list = + [`Delim ; `Text "3"; `Delim ; `Text "4. Commas go brrr."] + ]} *) + val split_full : ?pos:int -> ?len:int -> re -> string -> split_token list + + (** @deprecated Use {!module-Seq.split_full} instead. *) + val split_full_gen : ?pos:int -> ?len:int -> re -> string -> split_token gen + [@@ocaml.deprecated "Use Seq.split_full"] + + (** @deprecated Use {!module-Seq.split_full} instead. *) + val split_full_seq : ?pos:int -> ?len:int -> re -> string -> split_token Seq.t + [@@ocaml.deprecated "Use Seq.split_full"] + + module Seq : sig + (** Same as {!module-Re.val-all} but returns an iterator. 
+ + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.Seq.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.Group.t Seq.t = + ]} + @since 1.10.0 *) + val all : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> Group.t Seq.t + + (** Same as {!module-Re.val-matches}, but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = + + # Re.Seq.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string Seq.t = + ]} + @since 1.10.0 *) + val matches : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = + ]} + @since 1.10.0 *) + val split : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split_delim} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split_delim regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = + ]} + @since 1.11.1 *) + val split_delim : + ?pos:int (** Default: 0 *) -> ?len:int -> re -> string -> string Seq.t + + (** Same as {!module-Re.val-split_full} but returns an iterator. 
+ + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.Seq.split_full regex "Re,Ocaml,Jerome Vouillon";; + - : Re.split_token Seq.t = + ]} + @since 1.10.0 *) + val split_full : + ?pos:int (** Default: 0 *) -> + ?len:int -> + re -> + string -> + split_token Seq.t + end + + (** {2 String expressions (literal match)} *) + + val str : string -> t + val letter : letter -> t + val char : char -> letter + val int : int -> letter + + (** {2 Basic operations on regular expressions} *) + + (** Alternative. + + [alt []] is equivalent to {!empty}. + + By default, the leftmost match is preferred (see match semantics below). + *) + val alt : t list -> t + + (** Sequence *) + val seq : t list -> t + + (** Match nothing *) + val empty : t + + (** Empty word *) + val epsilon : t + + (** 0 or more matches *) + val rep : t -> t + + (** 1 or more matches *) + val rep1 : t -> t + + (** [repn re i j] matches [re] at least [i] times and at most [j] times, + bounds included. [j = None] means no upper bound. *) + val repn : t -> int -> int option -> t + + (** 0 or 1 matches *) + val opt : t -> t + + (** {2 String, line, word} + + We define a word as a sequence of latin1 letters, digits and underscore. + *) + + (** Beginning of line *) + val bol : t + + (** End of line *) + val eol : t + + (** Beginning of word *) + val bow : t + + (** End of word *) + val eow : t + + (** Beginning of string. This differs from {!start} because it matches the + beginning of the input string even when using [~pos] arguments: + + {[ + let b = execp (compile (seq [ bos; str "a" ])) "aa" ~pos:1 in + assert (not b) + ]} *) + val bos : t + + (** End of string. This is different from {!stop} in the way described in + {!bos}. *) + val eos : t + + (** Last end of line or end of string *) + val leol : t + + (** Initial position. 
This differs from {!bos} because it takes into account + the [~pos] arguments: + + {[ + let b = execp (compile (seq [ start; str "a" ])) "aa" ~pos:1 in + assert b + ]} *) + val start : t + + (** Final position. This is different from {!eos} in the way described in + {!start}. *) + val stop : t + + (** Word *) + val word : t -> t + + (** Not at a word boundary *) + val not_boundary : t + + (** Only matches the whole string, i.e. [fun t -> seq [ bos; t; eos ]]. *) + val whole_string : t -> t + + (** {2 Match semantics} + + A regular expression frequently matches a string in multiple ways. For + instance [exec (compile (opt (str "a"))) "ab"] can match "" or "a". Match + semantic can be modified with the functions below, allowing one to choose + which of these is preferable. + + By default, the leftmost branch of alternations is preferred, and + repetitions are greedy. + + Note that the existence of matches cannot be changed by specifying match + semantics. [seq [ bos; str "a"; non_greedy (opt (str "b")); eos ]] will + match when applied to "ab". However if + [seq [ bos; str "a"; non_greedy (opt (str "b")) ]] is applied to "ab", it + will match "a" rather than "ab". + + Also note that multiple match semantics can conflict. In this case, the + one executed earlier takes precedence. For instance, any match of + [shortest (seq [ bos; group (rep (str "a")); group (rep (str "a")); eos + ])] will always have an empty first group. Conversely, if we use + [longest] instead of [shortest], the second group will always be empty. *) + + (** Longest match semantics. That is, matches will match as many bytes as + possible. If multiple choices match the maximum amount of bytes, the one + respecting the inner match semantics is preferred. *) + val longest : t -> t + + (** Same as {!longest}, but matching the least number of bytes. *) + val shortest : t -> t + + (** First match semantics for alternations (not repetitions). 
That is, matches + will prefer the leftmost branch of the alternation that matches the text. + *) + val first : t -> t + + (** Greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they + will match as many times as possible. *) + val greedy : t -> t + + (** Non-greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): + they will match as few times as possible. *) + val non_greedy : t -> t + + (** {2 Groups (or submatches)} *) + + (** Delimit a group. The group is considered as matching if it is used at + least once (it may be used multiple times if is nested inside {!rep} for + instance). If it is used multiple times, the last match is what gets + captured. *) + val group : ?name:string -> t -> t + + (** Remove all groups *) + val no_group : t -> t + + (** When matching against [nest e], only the group matching in the last match + of e will be considered as matching. + + For instance: + {[ + let re = compile (rep1 (nest (alt [ group (str "a"); str "b" ]))) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = None); + (* same thing but without [nest] *) + let re = compile (rep1 (alt [ group (str "a"); str "b" ])) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = Some "a") + ]} *) + val nest : t -> t + + (** Mark a regexp. the markid can then be used to know if this regexp was + used. 
*) + val mark : t -> Mark.t * t + + (** {2 Character sets} *) + + (** Any character of the string *) + val set : string -> t + + (** Character ranges *) + val rg : letter -> letter -> t + + (** Intersection of character sets *) + val inter : t list -> t + + (** Difference of character sets *) + val diff : t -> t -> t + + (** Complement of union *) + val compl : t list -> t + + (** {2 Predefined character sets} *) + + (** Any character *) + val any : t + + (** Any character but a newline *) + val notnl : t + + val alnum : t + val wordc : t + val alpha : t + val ascii : t + val blank : t + val cntrl : t + val digit : t + val graph : t + val lower : t + val print : t + val punct : t + val space : t + val upper : t + val xdigit : t + + (** {2 Case modifiers} *) + + (** Case sensitive matching. Note that this works on latin1, not ascii and not + utf8. *) + val case : t -> t + + (** Case insensitive matching. Note that this works on latin1, not ascii and + not utf8. *) + val no_case : t -> t + + (****) + + (** {2 Internal debugging} *) + + val pp : Format.formatter -> t -> unit + val pp_re : Format.formatter -> re -> unit + + (** Alias for {!pp_re}. Deprecated *) + val print_re : Format.formatter -> re -> unit + + (** {2 Experimental functions} *) + + (** [witness r] generates a string [s] such that [execp (compile r) s] is + true. + + Be warned that this function is buggy because it ignores zero-width + assertions like beginning of words. As a result it can generate incorrect + results. *) + val witness : t -> string + + (** {2 Deprecated functions} *) + + (** Alias for {!Group.t}. Deprecated *) + type substrings = Group.t [@@ocaml.deprecated "Use Group.t"] + + (** Same as {!Group.get}. Deprecated *) + val get : Group.t -> int -> string + [@@ocaml.deprecated "Use Group.get"] + + (** Same as {!Group.offset}. Deprecated *) + val get_ofs : Group.t -> int -> int * int + [@@ocaml.deprecated "Use Group.offset"] + + (** Same as {!Group.all}. 
Deprecated *) + val get_all : Group.t -> string array + [@@ocaml.deprecated "Use Group.all"] + + (** Same as {!Group.all_offset}. Deprecated *) + val get_all_ofs : Group.t -> (int * int) array + [@@ocaml.deprecated "Use Group.all_offset"] + + (** Same as {!Group.test}. Deprecated *) + val test : Group.t -> int -> bool + [@@ocaml.deprecated "Use Group.test"] + + (** Alias for {!Mark.t}. Deprecated *) + type markid = Mark.t [@@ocaml.deprecated "Use Mark.t"] + + (** Same as {!Mark.test}. Deprecated *) + val marked : Group.t -> Mark.t -> bool + [@@ocaml.deprecated "Use Mark.test"] + + (** Same as {!Mark.all}. Deprecated *) + val mark_set : Group.t -> Mark.Set.t + [@@ocaml.deprecated "Use Mark.all"] + + module Stream : sig + (** An experimental API for matching a regular expression by feeding individual + string chunks. + + This module is not covered by semver's stability guarantee. *) + + type t + type 'a feed = Ok of 'a | No_match + + val create : re -> t + val feed : t -> string -> pos:int -> len:int -> t feed + + (** [finalize s ~pos ~len] feed [s] from [pos] to [len] and return whether + the regular expression matched. 
*) + val finalize : t -> string -> pos:int -> len:int -> bool + + module Group : sig + (** Match a string against a regular expression with capture groups *) + + type stream := t + type t + + module Match : sig + type t + + val get : t -> int -> string option + val test_mark : t -> Pmark.t -> bool + end + + val create : stream -> t + val feed : t -> string -> pos:int -> len:int -> t feed + val finalize : t -> string -> pos:int -> len:int -> Match.t feed + end + end +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type t = Ast.Make(Cset)(Color_map).t + and type re = Compile.Make(Cset)(Color_map).re + and type letter = Cset.letter diff --git a/lib/unicode/cset.ml b/lib/unicode/cset.ml new file mode 100644 index 00000000..4478d8e1 --- /dev/null +++ b/lib/unicode/cset.ml @@ -0,0 +1,511 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +module type Categories = sig + type c + + val cany : (c * c) list + val ascii : (c * c) list + val cdigit : (c * c) list + val cupper : (c * c) list + val clower : (c * c) list + val cword : (c * c) list + val calpha : (c * c) list + val calnum : (c * c) list + val xdigit : (c * c) list + val lower : (c * c) list + val upper : (c * c) list + val alpha : (c * c) list + val alnum : (c * c) list + val wordc : (c * c) list + val nl : (c * c) list + val blank : (c * c) list + val space : (c * c) list + val cntrl : (c * c) list + val graph : (c * c) list + val print : (c * c) list + val punct : (c * c) list +end + +module type CodePage = sig + type t [@@immediate] + type letter + + val to_letter : t -> letter + val from_letter : letter -> t + val null : t + val equal : t -> t -> bool + val compare : t -> t -> int + val succ : t -> t + val pred : t -> t + val max : t -> t -> t + val min : t -> t -> t + val max_t : t + val min_t : t + val offset : int -> t -> t + val to_int : t -> int + val of_int : int -> t + val of_char : char -> letter + val to_char : letter -> char + val pp : Format.formatter -> t -> unit + + module Categories : Categories with type c := t +end + +module type T = sig + type cp [@@immediate] + type letter + type t + + module CodePage : CodePage with type t := cp and type letter := letter + module Codec : Uucodecs.T with type letter = letter + + val equal : t -> t -> bool + val iter : t -> f:(cp -> cp -> unit) -> unit + val union : t -> t -> t + val union_all : t list -> t + val intersect_all : t list -> t + val inter : t -> t -> t + val diff : t -> t -> t + val empty : t + val single : cp -> t + val add : cp -> t -> t + val mem : cp -> t -> bool + val case_insens : t -> t + val cany : t + val ascii : t + val cdigit : t + val cupper 
: t + val clower : t + val calpha : t + val cword : t + val notnl : t + val nl : t + val cseq : letter -> letter -> t + val set : string -> t + val blank : t + val space : t + val xdigit : t + val lower : t + val upper : t + val alpha : t + val alnum : t + val wordc : t + val cntrl : t + val graph : t + val print : t + val punct : t + val pp : Format.formatter -> t -> unit + val one_c : t -> cp option + val fold_left : t -> init:'acc -> f:('acc -> cp -> cp -> 'acc) -> 'acc + val fold_right : t -> init:'acc -> f:(cp -> cp -> 'acc -> 'acc) -> 'acc + val hash : t -> int + val compare : t -> t -> int + + module CSetMap : Map.S with type key = int * t + + val csingle : letter -> t + val is_empty : t -> bool + val prepend : t -> 'a list -> (t * 'a list) list -> (t * 'a list) list + val pick : t -> cp + val offset : int -> t -> t + val to_dyn : t -> Dyn.t +end + +module UcharCp : CodePage with type letter = Uchar.t = struct + type t = int + type letter = Uchar.t + + external to_int : t -> int = "%identity" + external of_int : int -> t = "%identity" + + let of_char = Uchar.of_char + let to_char = Uchar.to_char + let min = Int.min + let max = Int.max + let max_t = 0x10ffff + let min_t = 0 + let to_letter c = Uchar.unsafe_of_int @@ of_int c + let from_letter u = of_int @@ Uchar.to_int u + let equal = fun cp cp' -> Int.equal (of_int cp) (of_int cp') + let compare = fun cp cp' -> Int.compare (of_int cp) (of_int cp') + + let succ t = + try Uchar.of_int t |> Uchar.succ |> Uchar.to_int with _ -> max_t + + let pred t = + try Uchar.of_int t |> Uchar.pred |> Uchar.to_int with _ -> min_t + + let offset = + fun ofs cp -> + let cp' = cp + ofs in + if cp' > max_t then max_t + else if cp <= 0xd7ff && cp' > 0xd7ff then 0xe000 - 0xd7ff + cp' + else cp' + + let null = of_int (-1) + let pp ppf t = Format.fprintf ppf "%d" t + + module Categories = struct + include Unicode.Regexp + + let ascii = [ (0x00, 0x7F) ] + let cany = [ (0x0000, 0xd7ff); (0xe000, 0x10ffff) ] + let blank = [ (0x0009, 
0x0009); (0x0020, 0x0020) ] + end +end + +module Make + (Codec : Uucodecs.T) + (Cp : CodePage with type letter = Codec.letter) : + T with type cp = Cp.t and type letter = Codec.letter = struct + type cp = Cp.t + type letter = Codec.letter + + module CodePage = Cp + module Codec = Codec + include CodePage.Categories + open! Import + + type t = (cp * cp) list + + (* type t = (cp * cp) array *) + + let equal_pair (x, y) (x', y') = CodePage.equal x x' && CodePage.equal y y' + + let compare_pair (x, y) (x', y') = + match CodePage.compare x x' with 0 -> CodePage.compare y y' | x -> x + + let equal = List.equal ~eq:equal_pair + let compare : t -> t -> int = List.compare ~cmp:compare_pair + + let print_one ppf (c1, c2) = + if CodePage.equal c1 c2 then Format.fprintf ppf "%a" CodePage.pp c1 + else Format.fprintf ppf "%a-%a" CodePage.pp c1 CodePage.pp c2 + + let pp ppf t = + Format.pp_print_list + ~pp_sep:(fun ppf () -> Format.fprintf ppf ",@ ") + print_one ppf t + + (* [union l l'] merges two range lists (assumed sorted and non-overlapping). + The head range that ends strictly before the other one starts (with a gap) + is emitted unchanged; otherwise the two heads overlap or touch and are + coalesced. The second test must look at [c2'], the end of the head of [l']: + comparing [c1] with [succ c2] can never succeed. *) + let rec union l l' = + match (l, l') with + | _, [] -> l + | [], _ -> l' + | (c1, c2) :: r, (c1', c2') :: r' -> + if CodePage.compare c1' (CodePage.succ c2) > 0 then (c1, c2) :: union r l' + else if CodePage.compare c1 (CodePage.succ c2') > 0 then + (c1', c2') :: union l r' + else if CodePage.compare c2' c2 > 0 then + union r ((CodePage.min c1 c1', c2') :: r') + else union ((CodePage.min c1 c1', c2) :: r) r' + + let rec inter l l' = + match (l, l') with + | _, [] -> [] + | [], _ -> [] + | (c1, c2) :: r, (c1', c2') :: r' -> + if CodePage.compare c1' c2 > 0 then inter r l' + else if CodePage.compare c1 c2' > 0 then inter l r' + else if CodePage.compare c2' c2 > 0 then + (CodePage.max c1 c1', c2) :: inter r l' + else (CodePage.max c1 c1', c2') :: inter l r' + + let rec diff l l' = + match (l, l') with + | _, [] -> l + | [], _ -> [] + | (c1, c2) :: r, (c1', c2') :: r' -> + if CodePage.compare c1' c2 > 0 then (c1, c2) :: diff r l' + else if CodePage.compare c1 c2' > 0 then diff l r' + else + let r'' = + if 
CodePage.compare c2 c2' > 0 then (CodePage.succ c2', c2) :: r + else r + in + if CodePage.compare c1' c1 > 0 then + (c1, CodePage.pred c1') :: diff r'' r' + else diff r'' r' + + let single c = [ (c, c) ] + let csingle letter = single @@ CodePage.from_letter letter + let add c s = union (single c) s + let seq c c' = if CodePage.compare c c' <= 0 then [ (c, c') ] else [ (c', c) ] + + let offset ofs = + List.fold_left ~init:[] ~f:(fun acc (c1, c2) -> + union acc (seq (CodePage.offset ofs c1) (CodePage.offset ofs c2))) + + let empty = [] + let union_all : t list -> t = List.fold_left ~init:empty ~f:union + let intersect_all : t list -> t = List.fold_left ~init:cany ~f:inter + + let rec mem (c : cp) cset = + match cset with + | [] -> false + | (c1, c2) :: rem -> + if CodePage.compare c2 c >= 0 then CodePage.compare c c1 >= 0 + else mem c rem + + (****) + + let rec hash_rec = function + | [] -> 0 + | (i, j) :: r -> + CodePage.to_int i + (13 * CodePage.to_int j) + (257 * hash_rec r) + + let hash l = hash_rec l land 0x3fffffff + + (****) + + let to_dyn t = + let open Dyn in + match t with + | [ (x, y) ] when CodePage.equal x y -> int @@ CodePage.to_int x + | _ -> + List.map t ~f:(fun (x, y) -> + pair (int @@ CodePage.to_int x) (int @@ CodePage.to_int y)) + |> list + + let rec iter t ~f = + match t with + | [] -> () + | (x, y) :: xs -> + f x y; + iter xs ~f + + let one_c = function + | [ (i, j) ] when CodePage.equal i j -> Some i + | _ -> None + + module CSetMap = Map.Make (struct + type t = int * (cp * cp) list + + let compare (i, u) (j, v) = + let c = Int.compare i j in + if c <> 0 then c else compare u v + end) + + let fold_left t ~init ~f = + List.fold_left ~f:(fun acc (x, y) -> f acc x y) t ~init + + let fold_right t ~init ~f = + List.fold_right ~f:(fun (x, y) acc -> f x y acc) t ~init + + let is_empty = function [] -> true | _ -> false + + let rec prepend s x l = + match (s, l) with + | [], _ -> l + | _r, [] -> [] + | (_c, c') :: r, ([ (d, _d') ], _x') :: _r' when 
CodePage.compare d c' > 0 + -> + prepend r x l + | (c, c') :: r, ([ (d, d') ], x') :: r' -> + if CodePage.compare d c >= 0 then + if CodePage.compare d c' > 0 then + ([ (d, c') ], x @ x') + :: prepend r x (([ (CodePage.succ c', d') ], x') :: r') + else ([ (d, d') ], x @ x') :: prepend s x r' + else if CodePage.compare d' c > 0 then ([ (d, d') ], x') :: prepend s x r' + else + ([ (d, CodePage.pred c) ], x') :: prepend s x (([ (c, d') ], x') :: r') + | _ -> assert false + + let pick = function [] -> invalid_arg "Re_cset.pick" | (x, _) :: _ -> x + + let cseq u u' = + inter cany @@ seq (CodePage.from_letter u) (CodePage.from_letter u') + + (* let rg = cseq *) + (* let uchar = csingle *) + (* let cadd c s = add (of_uchar c) s *) + + (* simple case mapping implemented. + see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G33992 + TODO: full case folding ? *) + let case_insens s = + let l = ref [] in + iter s ~f:(fun cp1 cp2 -> + let i = CodePage.to_int cp1 in + let j = CodePage.to_int cp2 in + for n = i to j do + List.iter ~f:(fun cp -> l := single (CodePage.of_int cp) :: !l) + @@ Unicode.get_simple_case_folding n + done); + union_all (s :: !l) + + let notnl = diff cany nl + + let set str = + Codec.fold_left (fun acc letter -> union acc (csingle letter)) empty str +end + +module Utf8 : T with type letter = Uchar.t = Make (Uucodecs.Utf8) (UcharCp) + +module Utf16be : T with type letter = Uchar.t = + Make (Uucodecs.Utf16be) (UcharCp) + +module Utf16le : T with type letter = Uchar.t = + Make (Uucodecs.Utf16le) (UcharCp) + +module CharCp : CodePage with type letter = char = struct + type t = int + type letter = Char.t + + let to_int = Fun.id + let of_int = Fun.id + let to_letter i = Char.chr i + let from_letter c = Char.code c + let of_char = Fun.id + let to_char = Fun.id + let equal = fun cp cp' -> Int.equal (of_int cp) (of_int cp') + let compare = fun cp cp' -> Int.compare (of_int cp) (of_int cp') + let max = Int.max + let min = Int.min + let max_t = 
255
+  let min_t = 0
+  let succ = succ
+  let pred = pred
+  let offset = fun ofs cp -> cp + ofs
+  (* -1 lies outside every category below, including [cany]. *)
+  let null = -1
+  let pp ppf t = Format.fprintf ppf "%d" t
+
+  (* Character classes for single-byte code points, each written as a list
+     of inclusive [(lo, hi)] intervals in ascending order. *)
+  module Categories = struct
+    let cany = [ (0x00, 0xFF) ]
+    let ascii = [ (0x00, 0x7F) ]
+    let cdigit = [ (0x30, 0x39) ]
+    let cupper = [ (0x41, 0x5A) ]
+    let upper = [ (0x41, 0x5A); (0xC0, 0xD6); (0xD8, 0xDE) ]
+    let clower = [ (0x61, 0x7A) ]
+    (* TAB..CR plus the space character. The space is 0x20 (decimal 32);
+       the former 0x32 is the digit '2', so "2" matched as white space
+       while " " did not. *)
+    let space = [ (0x09, 0x0D); (0x20, 0x20) ]
+    let xdigit = [ (0x30, 0x39); (0x41, 0x46); (0x61, 0x66) ]
+
+    let calpha =
+      [
+        (0x41, 0x5A);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xDE);
+        (0xDF, 0xDF);
+        (0xFF, 0xFF);
+      ]
+
+    let calnum =
+      [
+        (0x30, 0x39);
+        (0x41, 0x5A);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xDE);
+        (0xDF, 0xDF);
+        (0xFF, 0xFF);
+      ]
+
+    let cword =
+      [
+        (0x30, 0x39);
+        (0x41, 0x5A);
+        (0x5F, 0x5F);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xDE);
+        (0xDF, 0xDF);
+        (0xFF, 0xFF);
+      ]
+
+    let nl = [ (0x0A, 0x0A) ]
+
+    (* CR-someday rgrinberg: this [lower] doesn't match [clower] *)
+    let lower = [ (0x61, 0x7A); (0xB5, 0xB5); (0xDF, 0xF6); (0xF8, 0xFF) ]
+
+    let alpha =
+      [
+        (0x41, 0x5A);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xF6);
+        (0xF8, 0xFF);
+      ]
+
+    let alnum =
+      [
+        (0x30, 0x39);
+        (0x41, 0x5A);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xF6);
+        (0xF8, 0xFF);
+      ]
+
+    let wordc =
+      [
+        (0x30, 0x39);
+        (0x41, 0x5A);
+        (0x5F, 0x5F);
+        (0x61, 0x7A);
+        (0xAA, 0xAA);
+        (0xB5, 0xB5);
+        (0xBA, 0xBA);
+        (0xC0, 0xD6);
+        (0xD8, 0xF6);
+        (0xF8, 0xFF);
+      ]
+
+    let cntrl = [ (0x00, 0x1F); (0x7F, 0x9F) ]
+    let graph = [ (0x21, 0x7E); (0xA0, 0xFF) ]
+    let print = [ (0x20, 0x7E); (0xA0, 0xFF) ]
+
+    let punct =
+      [
+        (0x21, 0x2F);
+        (0x3A, 0x40);
+        (0x5B, 0x60);
+        (0x7B, 0x7E);
+        (0xA0, 0xA9);
+        (0xAB, 0xB4);
+        (0xB6,
0xB9); + (0xBB, 0xBF); + (0xD7, 0xD7); + (0xF7, 0xF7); + ] + + let blank = [ (0x0009, 0x0009); (0x0020, 0x0020) ] + end +end + +module Latin1 : T with type letter = Char.t = Make (Uucodecs.Latin1) (CharCp) diff --git a/lib/unicode/cset.mli b/lib/unicode/cset.mli new file mode 100644 index 00000000..3f2ae8cc --- /dev/null +++ b/lib/unicode/cset.mli @@ -0,0 +1,147 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*)
+
+(* Character sets, represented as sorted list of intervals *)
+
+(** Predefined character classes. Each class is a list of [(lo, hi)] pairs
+    over the code point type [c]. *)
+module type Categories = sig
+  type c
+
+  val cany : (c * c) list
+  val ascii : (c * c) list
+  val cdigit : (c * c) list
+  val cupper : (c * c) list
+  val clower : (c * c) list
+  val cword : (c * c) list
+  val calpha : (c * c) list
+  val calnum : (c * c) list
+  val xdigit : (c * c) list
+  val lower : (c * c) list
+  val upper : (c * c) list
+  val alpha : (c * c) list
+  val alnum : (c * c) list
+  val wordc : (c * c) list
+  val nl : (c * c) list
+  val blank : (c * c) list
+  val space : (c * c) list
+  val cntrl : (c * c) list
+  val graph : (c * c) list
+  val print : (c * c) list
+  val punct : (c * c) list
+end
+
+(** A code point type [t] together with the [letter] type it converts to and
+    from, its ordering, and the character classes defined over it. *)
+module type CodePage = sig
+  type t [@@immediate]
+  type letter
+
+  val to_letter : t -> letter
+  val from_letter : letter -> t
+
+  (** A special code point that is not present in any set (not even in
+      [cany]). *)
+  val null : t
+
+  val equal : t -> t -> bool
+  val compare : t -> t -> int
+  val succ : t -> t
+  val pred : t -> t
+  val max : t -> t -> t
+  val min : t -> t -> t
+  val max_t : t
+  val min_t : t
+
+  (** [offset ofs cp] takes the displacement first and the code point
+      second. *)
+  val offset : int -> t -> t
+
+  val to_int : t -> int
+  val of_int : int -> t
+
+  (** Note that [of_char] and [to_char] convert between [char] and [letter],
+      not [t]. *)
+  val of_char : char -> letter
+
+  val to_char : letter -> char
+  val pp : Format.formatter -> t -> unit
+
+  module Categories : Categories with type c := t
+end
+
+(** Character sets over the code points [cp] of one code page, with [letter]
+    being the character type handled by [Codec]. *)
+module type T = sig
+  type cp [@@immediate]
+  type letter
+  type t
+
+  module CodePage : CodePage with type t := cp and type letter := letter
+  module Codec : Uucodecs.T with type letter = letter
+
+  val equal : t -> t -> bool
+
+  (** [iter t ~f] calls [f] with the two bounds of each interval of [t]. *)
+  val iter : t -> f:(cp -> cp -> unit) -> unit
+
+  val union : t -> t -> t
+  val union_all : t list -> t
+  val intersect_all : t list -> t
+  val inter : t -> t -> t
+  val diff : t -> t -> t
+  val empty : t
+  val single :
cp -> t + val add : cp -> t -> t + val mem : cp -> t -> bool + val case_insens : t -> t + val cany : t + val ascii : t + val cdigit : t + val cupper : t + val clower : t + val calpha : t + val cword : t + val notnl : t + val nl : t + val cseq : letter -> letter -> t + val set : string -> t + val blank : t + val space : t + val xdigit : t + val lower : t + val upper : t + val alpha : t + val alnum : t + val wordc : t + val cntrl : t + val graph : t + val print : t + val punct : t + val pp : Format.formatter -> t -> unit + val one_c : t -> cp option + val fold_left : t -> init:'acc -> f:('acc -> cp -> cp -> 'acc) -> 'acc + val fold_right : t -> init:'acc -> f:(cp -> cp -> 'acc -> 'acc) -> 'acc + val hash : t -> int + val compare : t -> t -> int + + module CSetMap : Map.S with type key = int * t + + val csingle : letter -> t + val is_empty : t -> bool + val prepend : t -> 'a list -> (t * 'a list) list -> (t * 'a list) list + val pick : t -> cp + val offset : int -> t -> t + val to_dyn : t -> Dyn.t +end + +module Make : functor + (Codec : Uucodecs.T) + (_ : CodePage with type letter = Codec.letter) + -> T with type letter = Codec.letter + +module Utf8 : T with type letter = Uchar.t +module Utf16be : T with type letter = Uchar.t +module Utf16le : T with type letter = Uchar.t +module Latin1 : T with type letter = Char.t diff --git a/lib/unicode/dune b/lib/unicode/dune new file mode 100644 index 00000000..26363242 --- /dev/null +++ b/lib/unicode/dune @@ -0,0 +1,21 @@ +(rule + (targets unicode.ml unicode.mli) + (mode fallback) + (deps + (:in gen/data/ucd.all.grouped.xml) + (:gen gen/gen_unicode.exe)) + (action + (run %{gen} -i %{in}))) + +(library + (name re_unicode) + (public_name re.unicode) + (synopsis "Pure OCaml unicode regular expression library") + (libraries + uunf + uucp)) + +(copy_files# + (enabled_if + (< %{ocaml_version} 5)) + (files ../fake/*)) diff --git a/lib/unicode/dyn.ml b/lib/unicode/dyn.ml new file mode 100644 index 00000000..b0969df0 --- /dev/null +++ 
b/lib/unicode/dyn.ml @@ -0,0 +1,28 @@ +type t = + | Int of int + | Tuple of t list + | Enum of string + | String of string + | Array of t array + | List of t list + | Variant of string * t list + | Record of (string * t) list + +let variant x y = Variant (x, y) +let list x = List x +let array x = Array x +let int x = Int x +let pair x y = Tuple [ x; y ] +let record fields = Record fields +let enum x = Enum x +let string s = String s + +let result ok err = function + | Ok s -> variant "Ok" [ ok s ] + | Error e -> variant "Error" [ err e ] +;; + +let option f = function + | None -> enum "None" + | Some s -> variant "Some" [ f s ] +;; diff --git a/lib/unicode/emacs.ml b/lib/unicode/emacs.ml new file mode 100644 index 00000000..36a729e9 --- /dev/null +++ b/lib/unicode/emacs.ml @@ -0,0 +1,167 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +exception Parse_error +exception Not_supported + +module type T = sig + type core + type re + + (** Parsing of an Emacs-style regular expression *) + val re : ?case:bool -> string -> core + + val re_result : + ?case:bool -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** Regular expression compilation *) + val compile : core -> re + + (** Same as [Core.compile] *) + val compile_pat : ?case:bool -> string -> re + + val re_no_emacs : case:bool -> string -> core +end + +module Make (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = struct + + module Re = Core.Make(Cset)(Color_map) + + type core = Re.t + type re = Re.re + + module Parse_buffer = Parse_buffer.Make (Cset) + + module CodePage = Cset.CodePage + module Categories = CodePage.Categories + + let ( !! 
) = CodePage.of_char + let ( !^ ) = fun x -> CodePage.from_letter @@ CodePage.of_char x + + let by_code (f : int -> int -> int) c c' = + let c = CodePage.(from_letter c |> to_int) in + let c' = CodePage.(from_letter c' |> to_int) in + CodePage.to_letter (f c c' |> CodePage.of_int) + + let parse ~emacs_only s = + let buf = Parse_buffer.create s in + let accept = Parse_buffer.accept buf in + let eos () = Parse_buffer.eos buf in + let test2 = Parse_buffer.test2 buf in + let get () = Parse_buffer.get buf in + let rec regexp () = regexp' [ branch () ] + and regexp' left = + if Parse_buffer.accept_s buf {|\||} then regexp' (branch () :: left) + else Re.alt (List.rev left) + and branch () = branch' [] + and branch' left = + if eos () || test2 !!'\\' !!'|' || test2 !!'\\' !!')' then + Re.seq (List.rev left) + else branch' (piece () :: left) + and piece () = + let r = atom () in + if accept !!'*' then Re.rep r + else if accept !!'+' then Re.rep1 r + else if accept !!'?' then Re.opt r + else r + and atom () : Re.t = + if accept !!'.' 
then Re.notnl + else if accept !!'^' then Re.bol + else if accept !!'$' then Re.eol + else if accept !!'[' then + if accept !!'^' then Re.compl (bracket []) else Re.alt (bracket []) + else if accept !!'\\' then + if accept !!'(' then ( + let r = regexp () in + if not (Parse_buffer.accept_s buf {|\)|}) then raise Parse_error; + Re.group r) + else if emacs_only && accept !!'`' then Re.bos + else if emacs_only && accept !!'\'' then Re.eos + else if accept !!'=' then Re.start + else if accept !!'b' then Re.alt [ Re.bow; Re.eow ] + else if emacs_only && accept !!'B' then Re.not_boundary + else if emacs_only && accept !!'<' then Re.bow + else if emacs_only && accept !!'>' then Re.eow + else if accept !!'w' then Re.alt [ Re.alnum; Re.letter !!'_' ] + else if accept !!'W' then Re.compl [ Re.alnum; Re.letter !!'_' ] + else ( + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | c + when CodePage.equal c !^'*' || CodePage.equal c !^'+' + || CodePage.equal c !^'?' || CodePage.equal c !^'[' + || CodePage.equal c !^']' || CodePage.equal c !^'.' + || CodePage.equal c !^'^' || CodePage.equal c !^'$' + || CodePage.equal c !^'\\' -> + Re.letter @@ CodePage.to_letter c + | c when Cset.mem c Cset.cdigit -> raise Not_supported + | c -> + if emacs_only then raise Parse_error + else Re.letter @@ CodePage.to_letter c) + else ( + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | c when CodePage.equal c !^'*' -> raise Parse_error + | c when CodePage.equal c !^'+' -> raise Parse_error + | c when CodePage.equal c !^'?' 
-> raise Parse_error + | c -> Re.letter @@ CodePage.to_letter c) + and bracket s = + if s <> [] && accept !!']' then s + else + let c = letter () in + if accept !!'-' then + if accept !!']' then Re.letter c :: Re.letter !!'-' :: s + else + let c' = letter () in + let c' = by_code Int.max c c' in + bracket (Re.rg c c' :: s) + else bracket (Re.letter c :: s) + and letter () = + if eos () then raise Parse_error; + get () + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + + let re ?(case = true) s = + let r = parse s ~emacs_only:true in + if case then r else Re.no_case r + + let re_no_emacs ~case s = + let r = parse s ~emacs_only:false in + if case then r else Re.no_case r + + let re_result ?case s = + match re ?case s with + | s -> Ok s + | exception Not_supported -> Error `Not_supported + | exception Parse_error -> Error `Parse_error + + let compile = Re.compile + let compile_pat ?(case = true) s = compile (re ~case s) +end diff --git a/lib/unicode/emacs.mli b/lib/unicode/emacs.mli new file mode 100644 index 00000000..f9b99ec0 --- /dev/null +++ b/lib/unicode/emacs.mli @@ -0,0 +1,58 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Emacs-style regular expressions *) + +exception Parse_error + +(** Errors that can be raised during the parsing of the regular expression *) +exception Not_supported + +module type T = sig + type core + type re + + (** Parsing of an Emacs-style regular expression *) + val re : ?case:bool -> string -> core + + val re_result : + ?case:bool -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** Regular expression compilation *) + val compile : core -> re + + (** Same as [Core.compile] *) + val compile_pat : ?case:bool -> string -> re + + val re_no_emacs : case:bool -> string -> core +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type core = Core.Make(Cset)(Color_map).t + and type re = Core.Make(Cset)(Color_map).re diff --git a/lib/unicode/fmt.ml b/lib/unicode/fmt.ml new file mode 100644 index 00000000..a0d67483 --- /dev/null +++ b/lib/unicode/fmt.ml @@ -0,0 +1,51 @@ +(** Very small tooling for format printers. 
*) + +include Format + +type 'a t = Format.formatter -> 'a -> unit + +let list = pp_print_list +let array ?pp_sep f fmt arr = list ?pp_sep f fmt (Array.to_list arr) +let str = pp_print_string +let sexp fmt s pp x = fprintf fmt "@[<3>(%s@ %a)@]" s pp x +let bytes fmt t = Format.fprintf fmt "%S" (Bytes.to_string t) + +let pair pp1 pp2 fmt (v1, v2) = + pp1 fmt v1; + pp_print_space fmt (); + pp2 fmt v2 +;; + +let triple pp1 pp2 pp3 fmt (v1, v2, v3) = + pp1 fmt v1; + pp_print_space fmt (); + pp2 fmt v2; + pp_print_space fmt (); + pp3 fmt v3 +;; + +let opt f fmt x = + match x with + | None -> pp_print_string fmt "" + | Some x -> fprintf fmt "%a" f x +;; + +let int = pp_print_int + +let optint fmt = function + | None -> () + | Some i -> fprintf fmt "@ %d" i +;; + +let char fmt c = Format.fprintf fmt "%c" c +let bool = Format.pp_print_bool +let lit s fmt () = pp_print_string fmt s + +let to_to_string pp x = + let b = Buffer.create 16 in + let fmt = Format.formatter_of_buffer b in + pp fmt x; + Buffer.contents b +;; + +let quoted_string fmt s = Format.fprintf fmt "%S" s diff --git a/lib/unicode/fmt.mli b/lib/unicode/fmt.mli new file mode 100644 index 00000000..3dbadf46 --- /dev/null +++ b/lib/unicode/fmt.mli @@ -0,0 +1,18 @@ +type formatter := Format.formatter +type 'a t = formatter -> 'a -> unit + +val sexp : formatter -> string -> 'a t -> 'a -> unit +val str : string t +val optint : int option t +val opt : 'a t -> 'a option t +val char : char t +val bool : bool t +val int : int t +val pair : 'a t -> 'b t -> ('a * 'b) t +val triple : 'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t +val list : ?pp_sep:unit t -> 'a t -> 'a list t +val bytes : Bytes.t t +val array : ?pp_sep:unit t -> 'a t -> 'a array t +val lit : string -> unit t +val to_to_string : 'a t -> 'a -> string +val quoted_string : string t diff --git a/lib/unicode/gen/data/dune b/lib/unicode/gen/data/dune new file mode 100644 index 00000000..edb33604 --- /dev/null +++ b/lib/unicode/gen/data/dune @@ -0,0 +1,23 @@ +(rule + 
(target url) + (deps (:gen ../gen_text_file.exe)) + (action + (run %{gen} -o %{target}))) + +(rule + (target ucd.all.grouped.zip) + (deps url) + (action + (run + curl + -L + -s + %{read:url}/%{target} + -o + %{target}))) + +(rule + (target ucd.all.grouped.xml) + (deps (:gen ../unzip_data.exe) (:in ucd.all.grouped.zip)) + (action + (run %{gen} -i %{in} -o %{target}))) diff --git a/lib/unicode/gen/dune b/lib/unicode/gen/dune new file mode 100644 index 00000000..20989739 --- /dev/null +++ b/lib/unicode/gen/dune @@ -0,0 +1,15 @@ + +(executable + (name gen_text_file) + (modules gen_text_file) + (libraries uucp)) + +(executable + (name unzip_data) + (modules unzip_data) + (libraries zip)) + +(executable + (name gen_unicode) + (modules gen_unicode) + (libraries uucd)) diff --git a/lib/unicode/gen/gen_text_file.ml b/lib/unicode/gen/gen_text_file.ml new file mode 100644 index 00000000..4d3aafde --- /dev/null +++ b/lib/unicode/gen/gen_text_file.ml @@ -0,0 +1,11 @@ +let usage_msg = "gen_text_file -o " +let output_file = ref "" +let speclist = [("-o", Arg.Set_string output_file, "Set output file name")] +let anon_fun = fun _ -> () + +let _ = + Arg.parse speclist anon_fun usage_msg; + let url = Printf.sprintf "https://www.unicode.org/Public/%s/ucdxml" Uucp.unicode_version in + let oc = open_out !output_file in + output_string oc url; + close_out oc diff --git a/lib/unicode/gen/gen_unicode.ml b/lib/unicode/gen/gen_unicode.ml new file mode 100644 index 00000000..6beadf03 --- /dev/null +++ b/lib/unicode/gen/gen_unicode.ml @@ -0,0 +1,603 @@ +let pf = Stdlib.Format.fprintf + +let unicode_version (t : Uucd.t) = + List.nth (String.split_on_char ' ' t.description) 1 + +type rng = { + mutable k_start : int; + mutable k_end : int; + mutable state : bool; + mutable ranges : (int * int) list; +} + +let mk_rng () = { k_start = 0; k_end = 0; state = false; ranges = [] } + +let update_rng rng f k props = + match (rng.state, f k props) with + | false, false -> () + | true, true -> rng.k_end <- 
k
+  | true, false ->
+    rng.ranges <- (rng.k_start, rng.k_end) :: rng.ranges;
+    rng.state <- false
+  | false, true ->
+    rng.k_start <- k;
+    rng.k_end <- k;
+    rng.state <- true
+
+(* [ranges f t] visits every entry of [t.repertoire] and returns, in visiting
+   order, the [(first, last)] pairs of the runs of consecutively visited code
+   points for which [f k props] holds.
+   [update_rng] only records a run when it meets an entry that fails [f], so a
+   run that is still open after the last entry has to be flushed here;
+   otherwise the final interval is silently lost. *)
+let ranges f (t : Uucd.t) =
+  let rng = mk_rng () in
+  Uucd.Cpmap.iter (fun k props -> update_rng rng f k props) t.repertoire;
+  if rng.state then rng.ranges <- (rng.k_start, rng.k_end) :: rng.ranges;
+  List.rev rng.ranges
+
+(* Code points whose general category is one of the punctuation categories. *)
+let cset_punctuation t =
+  let general_category = Uucd.general_category in
+  let f =
+    fun _ props ->
+      match Uucd.find props general_category with
+      | Some c -> (
+        match c with
+        | `Pc | `Pd | `Ps | `Pe | `Pi | `Pf | `Po -> true
+        | _ -> false)
+      | _ -> false
+  in
+  ranges f t
+
+(* Letters, marks, numbers, punctuation and symbols; among the space
+   separators only code point 32 is accepted. *)
+let cset_printable t =
+  let general_category = Uucd.general_category in
+  let f =
+    fun k props ->
+      match Uucd.find props general_category with
+      | Some c -> (
+        match c with
+        (* letters *)
+        | `Lu | `Ll | `Lt | `Lm | `Lo
+        (* marks *)
+        | `Mn | `Mc | `Me
+        (* numbers *)
+        | `Nd | `Nl | `No
+        (* punctuation *)
+        | `Pc | `Pd | `Ps | `Pe | `Pi | `Pf | `Po
+        (* symbols *)
+        | `Sm | `Sc | `Sk | `So ->
+          true
+        (* spaces *)
+        | `Zs when k = 32 -> true
+        | _ -> false)
+      | _ -> false
+  in
+  ranges f t
+
+(* Same categories as [cset_printable], but every space separator (`Zs) is
+   accepted. *)
+let cset_graphic t =
+  let general_category = Uucd.general_category in
+  let f =
+    fun _ props ->
+      match Uucd.find props general_category with
+      | Some c -> (
+        match c with
+        (* letters *)
+        | `Lu | `Ll | `Lt | `Lm | `Lo
+        (* marks *)
+        | `Mn | `Mc | `Me
+        (* numbers *)
+        | `Nd | `Nl | `No
+        (* punctuation *)
+        | `Pc | `Pd | `Ps | `Pe | `Pi | `Pf | `Po
+        (* symbols *)
+        | `Sm | `Sc | `Sk | `So
+        (* spaces *)
+        | `Zs ->
+          true
+        | _ -> false)
+      | _ -> false
+  in
+  ranges f t
+
+(* Code points whose general category is `Cc. *)
+let cset_control t =
+  let gc = Uucd.general_category in
+  let f =
+    fun _ props -> match Uucd.find props gc with Some `Cc -> true | _ -> false
+  in
+  ranges f t
+
+(* Code points whose white_space property is true. *)
+let cset_space t =
+  let ws = Uucd.white_space in
+  let f =
+    fun _ props -> match Uucd.find props ws with Some true -> true | _ -> false
+  in
+  ranges f t
+
+let cset_new_line t =
+  let line_break = Uucd.line_break in
+  let f =
+
fun _ props -> + match Uucd.find props line_break with + (* see https://www.unicode.org/reports/tr14/#LB5 hard line breaks + and https://www.unicode.org/reports/tr14/#BK mandatory Break.*) + | Some `CR | Some `LF | Some `NL | Some `BK -> true + | _ -> false + in + ranges f t + +let cset_hex_digit t = + let hex = Uucd.hex_digit in + let f = + fun _ props -> + match Uucd.find props hex with Some true -> true | _ -> false + in + ranges f t + +let cset_wordc t = + let alphabetic props = + match Uucd.find props Uucd.alphabetic with Some true -> true | _ -> false + in + let numeric_type props = + match Uucd.find props Uucd.numeric_type with + | Some `None | None -> false + | _ -> true + in + let is_underscore k = Int.equal k 0x005f in + let f = + fun k props -> alphabetic props || numeric_type props || is_underscore k + in + ranges f t + +let cset_alnum t = + let alphabetic props = + match Uucd.find props Uucd.alphabetic with Some true -> true | _ -> false + in + let numeric_type props = + match Uucd.find props Uucd.numeric_type with + | Some `None | None -> false + | _ -> true + in + let f = fun _ props -> alphabetic props || numeric_type props in + ranges f t + +let cset_alpha t = + let alphabetic = Uucd.alphabetic in + let f = + fun _ props -> + match Uucd.find props alphabetic with Some true -> true | _ -> false + in + ranges f t + +let cset_upper t = + let uppercase = Uucd.uppercase in + let f = + fun _ props -> + match Uucd.find props uppercase with Some true -> true | _ -> false + in + ranges f t + +let cset_lower t = + let lowercase = Uucd.lowercase in + let f = + fun _ props -> + match Uucd.find props lowercase with Some true -> true | _ -> false + in + ranges f t + +let cset_calpha t = + let gc = Uucd.general_category in + let f = + fun k props -> + match (k, Uucd.find props gc) with + | _, Some `Lu | _, Some `Ll | _, Some `Lt | _, Some `Lm | _, Some `Lo -> + true + | _ -> false + in + ranges f t + +let cset_cword t = + let alpha = Uucd.alphabetic in + let gc = 
Uucd.general_category in + let jcntl = Uucd.join_control in + let f = + fun _ props -> + match Uucd.find props alpha with + | Some true -> true + | _ -> + match Uucd.find props gc with + | Some `Nd -> true + | _ -> + match Uucd.find props jcntl with + | Some true -> true + | _ -> false + in + ranges f t + +(* OK*) +let cset_calnum t = + let gc = Uucd.general_category in + let f = + fun k props -> + match (k, Uucd.find props gc) with + | _, Some `Lu + | _, Some `Ll + | _, Some `Lt + | _, Some `Lm + | _, Some `Lo + | _, Some `Nd + | _, Some `Nl + | _, Some `No -> + true + | _ -> false + in + ranges f t + +let cset_clower t = + let gc = Uucd.general_category in + let f = + fun _ props -> match Uucd.find props gc with Some `Ll -> true | _ -> false + in + ranges f t + +let cset_cupper t = + let gc = Uucd.general_category in + let f = + fun _ props -> match Uucd.find props gc with Some `Lu -> true | _ -> false + in + ranges f t + +let cset_cdigit t = + let nt = Uucd.numeric_type in + let f = + fun _ props -> match Uucd.find props nt with Some `De -> true | _ -> false + in + ranges f t + +let sep_semicolon ppf () = pf ppf ";@ " + +let pp_ml_list pp_v ppf l = + let is_first = ref true in + let pp_values ppf l = + List.iter + (fun v -> + if !is_first then is_first := false else sep_semicolon ppf (); + pp_v ppf v) + l + in + pf ppf "[ %a ]" pp_values l + +let pp_ml_array pp_v ppf l = + let is_first = ref true in + let pp_values pf l = + List.iter + (fun v -> + if !is_first then is_first := false else sep_semicolon pf (); + pp_v pf v) + l + in + pf ppf "[| %a |]" pp_values l + +let pp_hex ppf i = pf ppf "0x%04X" i +let pp_range ppf (k0, k1) = pf ppf "(%a, %a)" pp_hex k0 pp_hex k1 +let pp_ranges ppf l = pf ppf "%a" (pp_ml_list pp_range) l + +let pp_doc ppf doc = + match String.equal doc "" with + | true -> () + | _ -> + let l = String.split_on_char ' ' doc in + let pp_sep ppf () = pf ppf "@ " in + pf ppf "@[(** %a *)@]" + (Format.pp_print_list ~pp_sep Format.pp_print_string) + 
l + +let pp_val_with_doc_intf ppf (intf, doc) = + pf ppf "@[%a@,@]@[%a@,@]" pp_doc doc intf () + +let pp_values_impl ppf l = + List.iter (fun impl -> pf ppf "@[@[@[%a@]@,@]@,@]" impl ()) l + +let pp_values_intf ppf l = + List.iter + (fun (intf, doc) -> pf ppf "@[%a@,@]" pp_val_with_doc_intf (intf, doc)) + l + +let pp_module_impl name ppf l = + pf ppf "@[@[@[module %s = struct@,@]@,@[%a@]@,@]@,@]end@]" + name pp_values_impl l + +let pp_module_intf name ppf l = + pf ppf "@[@[@[module %s : sig@,@]@,@[%a@]@,@]@,@]end@]" + name pp_values_intf l + +let pp_warning ppf () = + pf ppf "(* WARNING do not edit. This file was automatically generated. *)" + +let pp_unicode_version_impl (t : Uucd.t) ppf () = + let version = List.nth (String.split_on_char ' ' t.description) 1 in + pf ppf "let unicode_version = \"%s\"" version + +let pp_unicode_version_intf ppf () = pf ppf "val unicode_version : string" + +let pp_unicode_regexp_version_impl ppf () = + pf ppf "let unicode_regexp_version = \"23\"" + +let pp_unicode_regexp_version_intf ppf () = + pf ppf "val unicode_regexp_version : string" + +let pp_prop_impl (name, l) ppf () = pf ppf "let %s = %a" name pp_ranges l +let pp_prop_val_intf name ppf () = pf ppf "val %s : (int * int) list" name + +let properties t = + [ + ( "cdigit", + cset_cdigit t, + "Characters that are restricted to digits which can be used in a decimal \ + radix positional numeral system and which are encoded in the standard \ + in a contiguous ascending range 0..9." ); + ("cupper", cset_cupper t, "Characters with the Lu general category."); + ("clower", cset_clower t, "Characters with the Ll general category."); + ( "cword", + cset_cword t, + "Characters that have the general_category with [< `Lu | `Ll | `Lt | `Lm \ + | `Lo | `Nd | `Nl | `No ], plus underscore." ); + ( "calpha", + cset_calpha t, + "Characters that have the general_category with [< `Lu | `Ll | `Lt | `Lm \ + | `Lo ]." 
);
+    ( "calnum",
+      cset_calnum t,
+      "Characters that have the general_category with [< `Lu | `Ll | `Lt | `Lm \
+       | `Lo | `Nd | `Nl | `No ]." );
+    ( "xdigit",
+      cset_hex_digit t,
+      "Characters commonly used for the representation of hexadecimal numbers, \
+       plus their compatibility equivalents. Property hex_digit is equal to \
+       true." );
+    ( "lower",
+      cset_lower t,
+      Printf.sprintf
+        "Characters with the Lowercase property. Generated from: Ll + \
+         Other_Lowercase. see {{: \
+         https://www.unicode.org/versions/Unicode%s/core-spec/chapter-4/#G138691} \
+         Chapter 4, Character Properties in Unicode}."
+        (unicode_version t) );
+    ( "upper",
+      cset_upper t,
+      Printf.sprintf
+        "Characters with the Uppercase property. Generated from: Lu + \
+         Other_Uppercase. see {{: \
+         https://www.unicode.org/versions/Unicode%s/core-spec/chapter-4/#G138691} \
+         Chapter 4, Character Properties in Unicode}."
+        (unicode_version t) );
+    (* The link below is built from the version of the data being processed,
+       like the [lower] and [upper] entries, instead of a fixed release; the
+       duplicated "see" of the generated text is also removed. *)
+    ( "alpha",
+      cset_alpha t,
+      Printf.sprintf
+        "Characters with the Alphabetic property. The use of the contributory \
+         Other_Alphabetic property in the derivation of the Alphabetic property \
+         enables the inclusion of various combining marks, such as dependent \
+         vowels in many Indic scripts, which function as basic elements to spell \
+         out words of those writing systems. The Alphabetic property is used in \
+         tooling which assigns default primary weights for characters, for \
+         generation of the DUCET table used by the Unicode Collation Algorithm \
+         (UCA). For more information, see {{: \
+         https://www.unicode.org/versions/Unicode%s/core-spec/chapter-4/#G138691} \
+         Chapter 4, Character Properties in Unicode}."
+        (unicode_version t) );
+    ( "alnum",
+      cset_alnum t,
+      "Characters that have the alphabetic and numeric_type properties." );
+    ( "wordc",
+      cset_wordc t,
+      "Characters that have the alphabetic, the numeric_type properties plus \
+       underscore."
); + ( "nl", + cset_new_line t, + "Characters that are considered as hard line breaks and can consist of \ + BK or a Newline Function (NLF) as described in {{: \ + https://www.unicode.org/versions/latest/core-spec/chapter-5/#G10213 } \ + Section 5.8, Newline Guidelines, of Unicode}. That means characters \ + with the line_break property matching [< `BK | `CR | `LF | `NL ]." ); + ( "space", + cset_space t, + "Spaces, separator characters and other control characters which should \ + be treated by programming languages as \"white space\" for the purpose \ + of parsing elements (i.e. property white_space equal to true). See {{: \ + https://www.unicode.org/reports/tr44/#White_Space} White_Space}" ); + ( "cntrl", + cset_control t, + "Characters that have their general_category value equal to `Cc (a C0 or \ + C1 control code)." ); + ( "graph", + cset_graphic t, + "graph is defined as a Graphic. Such characters include letters, marks, \ + numbers, punctuation, symbols, and spaces, from categories L, M, N, P, \ + S, Zs." ); + ( "print", + cset_printable t, + "Printable characters. Such characters include letters, marks, numbers, \ + punctuation, symbols, and the ASCII space character." ); + ( "punct", + cset_punctuation t, + "Punctuation character. Such characters include punctuation from general \ + category P." ); + ] + +let values t = + [ + ( pp_unicode_version_impl t, + pp_unicode_version_intf, + "[unicode_version] is the Unicode version supported by the library and \ + is matching {!Uucp.unicode_version}." ); + ( pp_unicode_regexp_version_impl, + pp_unicode_regexp_version_intf, + "The version of the Unicode Regular Expressions to which this library \ + tries to be compliant. See {{: \ + https://www.unicode.org/reports/tr18/#C0} Unicode Regular Expressions - \ + Conformance}." 
); + ] + +let pp_binary_search_impl ppf () = + pf ppf "let binary_search comp i v =@."; + pf ppf " let rec loop start finish =@."; + pf ppf " if finish < start || start > finish then false@."; + pf ppf " else@."; + pf ppf " let m = (start + finish) / 2 in@."; + pf ppf " match comp i v.(m) with@."; + pf ppf " | 0 -> true@."; + pf ppf " | 1 -> loop (m + 1) finish@."; + pf ppf " | _ -> loop start (m - 1)@."; + pf ppf " in@."; + pf ppf " loop 0 (Array.length v - 1)@." + +let pp_compare_impl ppf () = + pf ppf "let compare i (min, max) =@."; + pf ppf " if i >= min && i <= max then 0 else if i <= min then -1 else 1@." + +let nfx_qc_prop = function + | `NFD -> Uucd.nfd_quick_check + | `NFC -> Uucd.nfc_quick_check + | `NFKD -> Uucd.nfkd_quick_check + | `NFKC -> Uucd.nfkc_quick_check + +let pp_nfx ppf nfx = + let nfx = + match nfx with + | `NFD -> "nfd" + | `NFC -> "nfc" + | `NFKD -> "nfkd" + | `NFKC -> "nfkc" + in + pf ppf "%s" nfx + +let pp_nfx_qc_data_impl nfx ppf t = + let uucd_prop = nfx_qc_prop nfx in + let f = + fun _ props -> + match Uucd.find props uucd_prop with + | None -> false + | Some `False | Some `Maybe -> false + | Some `True -> true + in + let cset = ranges f t in + pf ppf "let %a_qc_data = %a" pp_nfx nfx (pp_ml_array pp_range) cset + +let pp_all_nfx_qc_data_impl ppf t = + List.iter + (fun nfx -> pf ppf "@[%a@]@." (pp_nfx_qc_data_impl nfx) t) + [ `NFC; `NFD; `NFKC; `NFKD ] + +let pp_nfx_quick_check_impl ppf t = + pf ppf "@[%a@]@." pp_all_nfx_qc_data_impl t; + pf ppf "@[%a@]@." pp_compare_impl (); + pf ppf "@[%a@]@." pp_binary_search_impl (); + pf ppf "let nfx_quick_check flag u =@."; + pf ppf " let cp = Uchar.to_int u in@."; + pf ppf " let t = match flag with@."; + pf ppf " | `NFD -> nfd_qc_data@."; + pf ppf " | `NFC -> nfc_qc_data@."; + pf ppf " | `NFKD -> nfkd_qc_data@."; + pf ppf " | `NFKC -> nfkc_qc_data@."; + pf ppf " in@."; + pf ppf " binary_search compare cp t@." 
+ +let pp_nfx_quick_check_intf ppf () = + pf ppf "val nfx_quick_check : Uunf.form -> Uchar.t -> bool" + +let simple_case_folding (t : Uucd.t) = + let tbl = Hashtbl.create 1026 in + let cf = Uucd.simple_case_folding in + Uucd.Cpmap.iter + (fun k props -> + match Uucd.find props cf with + | None | Some `Self -> () + | Some (`Cp cp) -> ( + try + Hashtbl.find tbl cp |> fun l -> + let l = + let l = if List.mem cp l then l else cp :: l in + if List.mem k l then l else k :: l + in + Hashtbl.replace tbl cp l + with Not_found -> Hashtbl.add tbl cp [ k; cp ])) + t.repertoire; + Hashtbl.fold (fun k l acc -> (k, l) :: acc) tbl [] + |> List.sort (fun (k1, _) (k2, _) -> Int.compare k1 k2) + +let pp_simple_case_fold ppf (k, l) = + pf ppf "@[%a, %a@]@ " pp_hex k (pp_ml_list pp_hex) l + +let pp_simple_case_folding_impl ppf t = + let a = simple_case_folding t in + pf ppf "let simple_case_folding = %a" (pp_ml_array pp_simple_case_fold) a + +let pp_get_simple_case_folding_impl ppf t = + pf ppf "@[%a@]@." pp_simple_case_folding_impl t; + pf ppf "let get_simple_case_folding cp =@."; + pf ppf " try@."; + pf ppf " let cp =@."; + pf ppf " match Uucp.Case.Fold.fold (Uchar.of_int cp) with@."; + pf ppf " | `Self -> cp@."; + pf ppf " | `Uchars [ cp'] -> Uchar.to_int cp'@."; + pf ppf " | _ -> raise Exit@."; + pf ppf " in@."; + pf ppf " let rec loop start finish =@."; + pf ppf " if finish < start || start > finish then []@."; + pf ppf " else@."; + pf ppf " let m = (start + finish) / 2 in@."; + pf ppf " match Int.compare cp (fst simple_case_folding.(m)) with@."; + pf ppf " | 0 -> snd simple_case_folding.(m)@."; + pf ppf " | 1 -> loop (m + 1) finish@."; + pf ppf " | _ -> loop start (m - 1)@."; + pf ppf " in@."; + pf ppf " loop 0 (Array.length simple_case_folding - 1)@."; + pf ppf " with Exit -> []@." + +let pp_get_simple_case_folding_intf ppf () = + pf ppf "val get_simple_case_folding : int -> int list" + +let pp_impl ppf t = + pf ppf "@[@[%a@]@,@[@]@." pp_warning (); + pf ppf "@[%a@]@." 
pp_values_impl + (List.map (fun (impl, _, _) -> impl) @@ values t); + pp_nfx_quick_check_impl ppf t; + pp_get_simple_case_folding_impl ppf t; + pf ppf "@[@[%a@]@,@]@." (pp_module_impl "Regexp") + @@ List.map (fun (name, l, _) -> pp_prop_impl (name, l)) + @@ properties t + +let pp_intf ppf t = + pf ppf "@[@[@[%a@]@,@]@,@]" pp_warning (); + pf ppf "@[%a@]@." pp_values_intf + (List.map (fun (_, intf, doc) -> (intf, doc)) @@ values t); + pf ppf "@[%a@]@." pp_nfx_quick_check_intf (); + pf ppf "@[%a@]@." pp_get_simple_case_folding_intf (); + pf ppf "@[%a@]@." (pp_module_intf "Regexp") + @@ List.map (fun (name, _, doc) -> (pp_prop_val_intf name, doc)) + @@ properties t + +let usage_msg = "gen_unicode -i " +let input_file = ref "" +let output = "unicode" +let speclist = [ ("-i", Arg.Set_string input_file, "Set input file name") ] +let anon_fun = fun _ -> () + +let ucd_or_die () = + try + let file_in = !input_file in + let ic = Stdlib.open_in file_in in + let d = Uucd.decoder (`Channel ic) in + match Uucd.decode d with + | `Ok db -> db + | `Error e -> + let (l0, c0), (l1, c1) = Uucd.decoded_range d in + Stdlib.Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" file_in l0 c0 l1 c1 e; + Stdlib.exit 1 + with Sys_error e -> + Stdlib.Printf.eprintf "%s\n%!" 
e; + Stdlib.exit 1 + +let _ = + Arg.parse speclist anon_fun usage_msg; + let ml = output ^ ".ml" in + let mli = output ^ ".mli" in + let ucd = ucd_or_die () in + let oc_impl = Stdlib.open_out ml in + let oc_intf = Stdlib.open_out mli in + let ppf_impl = Stdlib.Format.formatter_of_out_channel oc_impl in + let ppf_intf = Stdlib.Format.formatter_of_out_channel oc_intf in + pp_impl ppf_impl ucd; + pp_intf ppf_intf ucd; + Stdlib.close_out oc_intf; + Stdlib.close_out oc_impl diff --git a/lib/unicode/gen/unzip_data.ml b/lib/unicode/gen/unzip_data.ml new file mode 100644 index 00000000..01f4396a --- /dev/null +++ b/lib/unicode/gen/unzip_data.ml @@ -0,0 +1,16 @@ + +let usage_msg = "unzip_data -i -o " +let input_file = ref "" +let output_file = ref "" +let speclist = [("-i", Arg.Set_string input_file, "Set input file name"); ("-o", Arg.Set_string output_file, "Set output file name")] +let anon_fun = fun _ -> () + +let _ = + Arg.parse speclist anon_fun usage_msg; + let in_file = Zip.open_in !input_file in + let entry_name = Filename.basename !output_file in + let entry = Zip.find_entry in_file entry_name in + let oc = open_out_bin !output_file in + Zip.copy_entry_to_channel in_file entry oc; + close_out oc; + Zip.close_in in_file diff --git a/lib/unicode/glob.ml b/lib/unicode/glob.ml new file mode 100644 index 00000000..b64bc371 --- /dev/null +++ b/lib/unicode/glob.ml @@ -0,0 +1,399 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +exception Parse_error = Parse_buffer.Parse_error + +module type T = sig + type core + + val glob : + ?anchored:bool -> + ?pathname:bool -> + ?match_backslashes:bool -> + ?period:bool -> + ?expand_braces:bool -> + ?double_asterisk:bool -> + string -> + core + + val glob_result : + ?anchored:bool -> + ?pathname:bool -> + ?match_backslashes:bool -> + ?period:bool -> + ?expand_braces:bool -> + ?double_asterisk:bool -> + string -> + (core, [ `Parse_error ]) result + + (** Same, but allows to choose whether dots at the beginning of a file name + need to be explicitly matched (true) or not (false) + + @deprecated Use [glob ~period]. *) + val glob' : ?anchored:bool -> bool -> string -> core + + (** This version of [glob] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true]. *) + val globx : ?anchored:bool -> string -> core + + (** This version of [glob'] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true ~period]. 
*) + val globx' : ?anchored:bool -> bool -> string -> core +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Re = Core.Make (Cset) (Color_map) + module CodePage = Cset.CodePage + module Categories = CodePage.Categories + module Parse_buffer = Parse_buffer.Make (Cset) + + type core = Re.t + type enclosed = Letter of Cset.letter | Range of Cset.letter * Cset.letter + + (* let pp_enclosed ppf l = + List.iter + (fun e -> + match e with + | Letter letter -> + Format.fprintf ppf "Letter: '%a' \n" Cset.CodePage.pp + (Cset.CodePage.from_letter letter) + | Range (letter, letter') -> + Format.fprintf ppf "Range: '%a'-'%a \n" Cset.CodePage.pp + (Cset.CodePage.from_letter letter) + Cset.CodePage.pp + (Cset.CodePage.from_letter letter')) + l *) + + type piece = + | Exactly of Cset.letter + | Any_of of enclosed list + | Any_but of enclosed list + | One + | Many + | ManyMany + + type t = piece list + + let ( !! 
) = CodePage.of_char + let ( !^ ) = fun x -> CodePage.from_letter @@ CodePage.of_char x + + let of_string ~double_asterisk s : t = + let buf = Parse_buffer.create s in + let eos () = Parse_buffer.eos buf in + let read c = Parse_buffer.accept buf c in + let letter () = + ignore (read !!'\\' : bool); + if eos () then raise Parse_error; + Parse_buffer.get buf + in + let enclosed () : enclosed list = + let rec loop s = + (* This returns the list in reverse order, but order isn't important + anyway *) + if s <> [] && read !!']' then s + else + let l = letter () in + if not (read !!'-') then loop (Letter l :: s) + else if read !!']' then Letter l :: Letter !!'-' :: s + else + let l' = letter () in + loop (Range (l, l') :: s) + in + loop [] + in + let piece acc = + if double_asterisk && Parse_buffer.accept_s buf "/**" then + ManyMany + :: + (if eos () then ( + Exactly !!'/' :: acc) + else acc) + else if read !!'*' then + (if double_asterisk && read !!'*' then ManyMany else Many) :: acc + else if read !!'?' then One :: acc + else if not (read !!'[') then Exactly (letter ()) :: acc + else if read !!'^' || read !!'!' 
then Any_but (enclosed ()) :: acc + else Any_of (enclosed ()) :: acc + in + let rec loop pieces = + if eos () then List.rev pieces else loop (piece pieces) + in + loop [] + + let mul l l' = + List.flatten (List.map (fun s -> List.map (fun s' -> s ^ s') l') l) + + let explode str = + let len = String.length str in + let rec expl inner s pos acc beg max = + if pos >= max then ( + if inner then raise Parse_error; + (mul beg [ String.sub str s (pos - s) ], pos)) + else + let bytes, pos_next = + Cset.Codec.Unsafe.unsafe_bytes_with_next_pos str pos + in + match Cset.CodePage.from_letter @@ Cset.Codec.from_bytes bytes with + | c when CodePage.equal c !^'\\' -> + expl inner s (pos + (2 * (pos_next - pos))) acc beg max + | c when CodePage.equal c !^'{' -> + let t, i' = expl true pos_next pos_next [] [ "" ] max in + expl inner i' i' acc + (mul beg (mul [ String.sub str s (pos - s) ] t)) + max + | c when CodePage.equal c !^',' && inner -> + expl inner pos_next pos_next + (mul beg [ String.sub str s (pos - s) ] @ acc) + [ "" ] max + | c when CodePage.equal c !^'}' && inner -> + (mul beg [ String.sub str s (pos - s) ] @ acc, pos_next) + | _ -> expl inner s pos_next acc beg max + in + List.rev (fst (expl false 0 0 [] [ "" ] len)) + + module State = struct + type t = { + re_pieces : Re.t list (* last piece at head of list. *); + remaining : piece list (* last piece at tail of list. 
*); + am_at_start_of_pattern : bool; (* true at start of pattern *) + am_at_start_of_component : bool; + (* true at start of pattern or immediately + after '/' *) + pathname : bool; + match_backslashes : bool; + period : bool; + } + + let create ~period ~pathname ~match_backslashes remaining = + { + re_pieces = []; + am_at_start_of_pattern = true; + am_at_start_of_component = true; + pathname; + match_backslashes; + period; + remaining; + } + + let explicit_period t = + t.period + && (t.am_at_start_of_pattern || (t.am_at_start_of_component && t.pathname)) + + let explicit_slash t = t.pathname + + let slashes t = + if t.match_backslashes then ( + [ !!'/'; !!'\\' ]) + else ( + [ !!'/' ]) + + let append ?(am_at_start_of_component = false) t piece = + { + t with + re_pieces = piece :: t.re_pieces; + am_at_start_of_pattern = false; + am_at_start_of_component; + } + + let to_re t = Re.seq (List.rev t.re_pieces) + + let next t = + match t.remaining with + | [] -> None + | piece :: remaining -> Some (piece, { t with remaining }) + end + + let one ~explicit_slash ~slashes ~explicit_period = + Re.compl + (List.concat + [ + (if explicit_slash then List.map Re.letter slashes else []); + (if explicit_period then [ Re.letter !!'.' 
] else []); + ]) + + let enclosed enclosed = + match enclosed with + | Letter l -> Re.letter l + | Range (low, high) -> Re.rg low high + + let enclosed_set ~explicit_slash ~slashes ~explicit_period kind set = + let set = List.map enclosed set in + let enclosure = + match kind with `Any_of -> Re.alt set | `Any_but -> Re.compl set + in + Re.inter [ enclosure; one ~explicit_slash ~slashes ~explicit_period ] + + let exactly state letter = + let slashes = State.slashes state in + let am_at_start_of_component = List.mem letter slashes in + let letters = if am_at_start_of_component then slashes else [ letter ] in + State.append state + (Re.alt (List.map Re.letter letters)) + ~am_at_start_of_component + + let many_many state = + let explicit_period = state.State.period && state.State.pathname in + let first_explicit_period = State.explicit_period state in + let slashes = State.slashes state in + let match_component ~explicit_period = + Re.seq + [ + one ~explicit_slash:true ~slashes ~explicit_period; + Re.rep (one ~explicit_slash:true ~slashes ~explicit_period:false); + ] + in + (* We must match components individually when [period] flag is set, + making sure to not match ["foo/.bar"]. *) + State.append state + (Re.seq + [ + Re.opt (match_component ~explicit_period:first_explicit_period); + Re.rep + (Re.seq + [ + Re.alt (List.map Re.letter slashes); + Re.opt (match_component ~explicit_period); + ]); + ]) + + let many (state : State.t) = + let explicit_slash = State.explicit_slash state in + let explicit_period = State.explicit_period state in + let slashes = State.slashes state in + (* Whether we must explicitly match period depends on the surrounding + characters, but slashes are easy to explicit match. This conditional + splits out some simple cases. 
*) + if not explicit_period then + State.append state + (Re.rep (one ~explicit_slash ~slashes ~explicit_period)) + else if not explicit_slash then + (* In this state, we explicitly match periods only at the very beginning *) + State.append state + (Re.opt + (Re.seq + [ + one ~explicit_slash:false ~slashes ~explicit_period; + Re.rep + (one ~explicit_slash:false ~slashes ~explicit_period:false); + ])) + else + let not_empty = + Re.seq + [ + one ~explicit_slash:true ~slashes ~explicit_period:true; + Re.rep (one ~explicit_slash:true ~slashes ~explicit_period:false); + ] + in + (* [maybe_empty] is the default translation of Many, except in some special + cases. *) + let maybe_empty = Re.opt not_empty in + let enclosed_set state kind set = + State.append state + (Re.alt + [ + enclosed_set kind set ~explicit_slash:true ~slashes + ~explicit_period:true; + Re.seq + [ + not_empty; + (* Since [not_empty] matched, subsequent dots are not leading. *) + enclosed_set kind set ~explicit_slash:true ~slashes + ~explicit_period:false; + ]; + ]) + in + let rec lookahead state = + match State.next state with + | None -> State.append state maybe_empty + (* glob ** === glob * . *) + | Some (Many, state) -> lookahead state + | Some (Exactly c, state) -> + let state = + State.append state (if c = !!'.' then not_empty else maybe_empty) + in + exactly state c + (* glob *? 
=== glob ?* *) + | Some (One, state) -> State.append state not_empty + | Some (Any_of enclosed, state) -> enclosed_set state `Any_of enclosed + | Some (Any_but enclosed, state) -> enclosed_set state `Any_but enclosed + (* * then ** === ** *) + | Some (ManyMany, state) -> many_many state + in + lookahead state + + let piece state piece = + let explicit_slash = State.explicit_slash state in + let explicit_period = State.explicit_period state in + let slashes = State.slashes state in + match piece with + | One -> State.append state (one ~explicit_slash ~slashes ~explicit_period) + | Many -> many state + | Any_of enclosed -> + State.append state + (enclosed_set `Any_of ~explicit_slash ~slashes ~explicit_period enclosed) + | Any_but enclosed -> + State.append state + (enclosed_set `Any_but ~explicit_slash ~slashes ~explicit_period + enclosed) + | Exactly c -> exactly state c + | ManyMany -> many_many state + + let glob ~pathname ~match_backslashes ~period glob = + let rec loop state = + match State.next state with + | None -> State.to_re state + | Some (p, state) -> loop (piece state p) + in + loop (State.create ~pathname ~match_backslashes ~period glob) + + let glob ?(anchored = false) ?(pathname = true) ?(match_backslashes = false) + ?(period = true) ?(expand_braces = false) ?(double_asterisk = true) s = + let to_re s = + let re = + glob ~pathname ~match_backslashes ~period (of_string ~double_asterisk s) + in + if anchored then Re.whole_string re else re + in + if expand_braces then Re.alt (List.map to_re (explode s)) else to_re s + + let glob_result ?anchored ?pathname ?match_backslashes ?period ?expand_braces + ?double_asterisk s = + match + glob ?anchored ?pathname ?match_backslashes ?period ?expand_braces + ?double_asterisk s + with + | re -> Ok re + | exception Parse_error -> Error `Parse_error + + let glob' ?anchored period s = glob ?anchored ~period s + let globx ?anchored s = glob ?anchored ~expand_braces:true s + let globx' ?anchored period s = glob ?anchored 
~expand_braces:true ~period s +end diff --git a/lib/unicode/glob.mli b/lib/unicode/glob.mli new file mode 100644 index 00000000..8db4640c --- /dev/null +++ b/lib/unicode/glob.mli @@ -0,0 +1,109 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Shell-style regular expressions *) + +exception Parse_error + +(** Implements the semantics of shells patterns. The returned regular expression + is unanchored by default. + + Character '*' matches any sequence of characters and character '?' matches a + single character. A sequence '[...]' matches any one of the enclosed + characters. A sequence '[^...]' or '[!...]' matches any character *but* the + enclosed characters. A backslash escapes the following character. The last + character of the string cannot be a backslash. + + [anchored] controls whether the regular expression will only match entire + strings. Defaults to false. + + [pathname]: If this flag is set, match a slash in string only with a slash + in pattern and not by an asterisk ('*') or a question mark ('?') + metacharacter, nor by a bracket expression ('[]') containing a slash. + Defaults to true. 
+ + [match_backslashes]: If this flag is set, a forward slash will also match a + backslash (useful when globbing Windows paths). Note that a backslash in the + pattern will continue to escape the following character. Defaults to + [false]. + + [period]: If this flag is set, a leading period in string has to be matched + exactly by a period in pattern. A period is considered to be leading if it + is the first character in string, or if both [pathname] is set and the + period immediately follows a slash. Defaults to true. + + If [expand_braces] is true, braced sets will expand into multiple globs, + e.g. a\{x,y\}b\{1,2\} matches axb1, axb2, ayb1, ayb2. As specified for bash, + brace expansion is purely textual and can be nested. Defaults to false. + + [double_asterisk]: If this flag is set, double asterisks ('**') will match + slash characters, even if [pathname] is set. The [period] flag still + applies. Default to true. *) + +module type T = sig + type core + + val glob : + ?anchored:bool -> + ?pathname:bool -> + ?match_backslashes:bool -> + ?period:bool -> + ?expand_braces:bool -> + ?double_asterisk:bool -> + string -> + core + + val glob_result : + ?anchored:bool -> + ?pathname:bool -> + ?match_backslashes:bool -> + ?period:bool -> + ?expand_braces:bool -> + ?double_asterisk:bool -> + string -> + (core, [ `Parse_error ]) result + + (** Same, but allows to choose whether dots at the beginning of a file name + need to be explicitly matched (true) or not (false) + + @deprecated Use [glob ~period]. *) + val glob' : ?anchored:bool -> bool -> string -> core + + (** This version of [glob] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true]. *) + val globx : ?anchored:bool -> string -> core + + (** This version of [glob'] also recognizes the pattern \{..,..\} + + @deprecated Prefer [glob ~expand_braces:true ~period]. 
*) + val globx' : ?anchored:bool -> bool -> string -> core +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T with type core = Core.Make(Cset)(Color_map).t diff --git a/lib/unicode/group.ml b/lib/unicode/group.ml new file mode 100644 index 00000000..88810400 --- /dev/null +++ b/lib/unicode/group.ml @@ -0,0 +1,103 @@ +(* Result of a successful match. *) +type t = + { (* Input string. Matched strings are substrings of s *) + s : string + (* Mapping from group indices to positions in gpos. group i has positions 2*i + - 1, 2*i + 1 in gpos. If the group wasn't matched, then its corresponding + values in marks will be -1,-1 *) + ; marks : Mark_infos.t + ; (* Marks positions. i.e. those marks created with Re.marks *) + pmarks : Pmark.Set.t + ; (* Group positions. Adjacent elements are (start, stop) of group match. + indexed by the values in marks. So group i in an re would be the substring: + + start = t.gpos.(marks.(2*i)) - 1 + stop = t.gpos.(marks.(2*i + 1)) - 1 *) + gpos : int array + ; (* Number of groups the regular expression contains. 
Matched or not *) + gcount : int + } + +let create s ~gcount ~gpos marks pmarks = { s; gcount; gpos; marks; pmarks } + +module Offset = struct + type t = int + + let absent = -1 + let is_present t = t >= 0 + let get_no_check t = t +end + +let start_offset t i = + let i = Mark_infos.start_offset t.marks i in + if Mark_infos.Offset.is_present i + then t.gpos.(Mark_infos.Offset.get_no_check i) + else Offset.absent +;; + +let stop_offset t i = + let i = Mark_infos.stop_offset t.marks i in + if Mark_infos.Offset.is_present i + then t.gpos.(Mark_infos.Offset.get_no_check i) + else Offset.absent +;; + +let offset_opt t i = + Mark_infos.offset t.marks i + |> Option.map (fun (start, stop) -> t.gpos.(start), t.gpos.(stop)) +;; + +let or_not_found = function + | None -> raise Not_found + | Some s -> s +;; + +let offset t i = offset_opt t i |> or_not_found + +let get_opt t i = + offset_opt t i |> Option.map (fun (p1, p2) -> String.sub t.s p1 (p2 - p1)) +;; + +let pmarks t = t.pmarks +let get t i = get_opt t i |> or_not_found +let start_opt subs i = offset_opt subs i |> Option.map fst +let start subs i = start_opt subs i |> or_not_found +let stop_opt subs i = offset_opt subs i |> Option.map snd +let stop subs i = stop_opt subs i |> or_not_found +let test t i = Mark_infos.test t.marks i +let get_opt t i = if test t i then Some (get t i) else None +let dummy_offset = -1, -1 + +let all_offset t = + let res = Array.make t.gcount dummy_offset in + Mark_infos.iteri t.marks ~f:(fun i start stop -> + let p1 = t.gpos.(start) in + let p2 = t.gpos.(stop) in + res.(i) <- p1, p2); + res +;; + +let dummy_string = "" + +let all t = + let res = Array.make t.gcount dummy_string in + Mark_infos.iteri t.marks ~f:(fun i start stop -> + let p1 = t.gpos.(start) in + let p2 = t.gpos.(stop) in + res.(i) <- String.sub t.s p1 (p2 - p1)); + res +;; + +let pp fmt t = + let matches = + let offsets = all_offset t in + let strs = all t in + Array.to_list (Array.init (Array.length strs) (fun i -> strs.(i), 
offsets.(i))) + in + let open Format in + let open Fmt in + let pp_match fmt (str, (start, stop)) = fprintf fmt "@[(%s (%d %d))@]" str start stop in + sexp fmt "Group" (list pp_match) matches +;; + +let nb_groups t = t.gcount diff --git a/lib/unicode/group.mli b/lib/unicode/group.mli new file mode 100644 index 00000000..2e0263a9 --- /dev/null +++ b/lib/unicode/group.mli @@ -0,0 +1,54 @@ +(** Information about groups in a match. *) + +(** Result of a successful match. *) +type t + +val create : string -> gcount:int -> gpos:int array -> Mark_infos.t -> Pmark.Set.t -> t + +(** Raise [Not_found] if the group did not match *) +val get : t -> int -> string + +(** Similar to {!get}, but returns an option instead of using an exception. *) +val get_opt : t -> int -> string option + +(** Raise [Not_found] if the group did not match *) +val offset : t -> int -> int * int + +val offset_opt : t -> int -> (int * int) option + +(** Return the start of the match. Raise [Not_found] if the group did not match. *) +val start : t -> int -> int + +val start_opt : t -> int -> int option + +(** Return the end of the match. Raise [Not_found] if the group did not match. *) +val stop : t -> int -> int + +val stop_opt : t -> int -> int option + +(** Return the empty string for each group which did not match *) +val all : t -> string array + +(** Return [(-1,-1)] for each group which did not match *) +val all_offset : t -> (int * int) array + +(** Test whether a group matched *) +val test : t -> int -> bool + +val pmarks : t -> Pmark.Set.t + +(** Returns the total number of groups defined - matched or not. + This function is experimental. 
*) +val nb_groups : t -> int + +val pp : t Fmt.t + +module Offset : sig + type t + + val is_present : t -> bool + val get_no_check : t -> int +end + +val start_offset : t -> int -> Offset.t +val stop_offset : t -> int -> Offset.t diff --git a/lib/unicode/hash_set.ml b/lib/unicode/hash_set.ml new file mode 100644 index 00000000..c49ca15c --- /dev/null +++ b/lib/unicode/hash_set.ml @@ -0,0 +1,155 @@ +open Import + +module Array = struct + type nonrec t = Bytes.t + + let words = 8 + let[@inline] length t = Bytes.length t / words + let[@inline] unsafe_get t i = Int64.to_int (Bytes.get_int64_ne t (i * words)) + let[@inline] unsafe_set t i x = Bytes.set_int64_ne t (i * words) (Int64.of_int x) + + let[@inline] make len x = + let t = Bytes.create (len * words) in + for i = 0 to length t - 1 do + unsafe_set t i x + done; + t + ;; + + let[@inline] make_absent len = Bytes.make (len * words) '\255' + let clear t = Bytes.fill t 0 (Bytes.length t) '\255' + + let fold_left t ~init ~f = + let init = ref init in + for i = 0 to length t - 1 do + init := f !init (unsafe_get t i) + done; + !init + ;; +end + +(* A specialized hash table that makes the following trade-offs: + - Open addresing. Bucketing is quite memory intensive and dune is already + a memory hog. + - No boxing for empty slots. We make use of the fact that id's are never + negative to achieve this. + - No saving of the hash. Recomputing the hash for id's is a no-op. 
+*) + +type nonrec table = + { mutable table : Array.t + ; mutable size : int + } + +type t = table Option.t ref + +let init t = + if Option.is_none !t then t := Option.some { size = 0; table = Array.make 0 (-1) }; + Option.get !t +;; + +let[@inline] should_grow t = + let slots = Array.length t.table in + slots = 0 || (t.size > 0 && slots / t.size < 2) +;; + +let absent = -1 + +let () = + let x = Array.make_absent 1 in + assert (Array.unsafe_get x 0 = absent) +;; + +let create () = ref Option.none + +let[@inline] index_of_offset slots index i = + let i = index + !i in + if i >= slots then i - slots else i +;; + +let clear t = + match !t with + | None -> () + | Some t -> + t.size <- 0; + Array.clear t.table +;; + +let add t x = + let hash = Int.hash x in + let slots = Array.length t.table in + let index = hash land (slots - 1) in + let inserting = ref true in + let i = ref 0 in + while !inserting do + let idx = index_of_offset slots index i in + let elem = Array.unsafe_get t.table idx in + if elem = absent + then ( + Array.unsafe_set t.table idx x; + inserting := false) + else incr i + done; + t.size <- t.size + 1 +;; + +let resize t = + let old_table = t.table in + let slots = Array.length old_table in + let table = Array.make_absent (if slots = 0 then 1 else slots lsl 1) in + t.table <- table; + for i = 0 to slots - 1 do + let elem = Array.unsafe_get old_table i in + if elem <> absent then add t elem + done +;; + +let add t x = + let t = init t in + if should_grow t then resize t; + add t x +;; + +let[@inline] is_empty t = + let t = !t in + if Option.is_none t + then true + else ( + let t = Option.get t in + t.size = 0) +;; + +let mem t x = + let t = !t in + if Option.is_none t || (Option.get t).size = 0 + then false + else ( + let t = Option.get t in + let hash = Int.hash x in + let slots = Array.length t.table in + let index = hash land (slots - 1) in + let i = ref 0 in + let found = ref false in + while (not !found) && !i < slots do + let idx = index_of_offset 
slots index i in + let elem = Array.unsafe_get t.table idx in + if Int.equal elem x + then found := true + else if Int.equal elem absent + then i := slots + else incr i + done; + !found) +;; + +let pp fmt t = + let { table; size } = init t in + let table = + Array.fold_left table ~init:[] ~f:(fun acc i -> if i = absent then acc else i :: acc) + |> List.rev + |> Stdlib.Array.of_list + in + let table fmt () = Fmt.sexp fmt "table" Fmt.(array int) table in + let size fmt () = Fmt.sexp fmt "size" Fmt.int size in + Format.fprintf fmt "%a@.%a@." table () size () +;; diff --git a/lib/unicode/hash_set.mli b/lib/unicode/hash_set.mli new file mode 100644 index 00000000..f2f82215 --- /dev/null +++ b/lib/unicode/hash_set.mli @@ -0,0 +1,8 @@ +type t + +val create : unit -> t +val is_empty : t -> bool +val add : t -> int -> unit +val mem : t -> int -> bool +val clear : t -> unit +val pp : t Fmt.t diff --git a/lib/unicode/import.ml b/lib/unicode/import.ml new file mode 100644 index 00000000..ea7bfd66 --- /dev/null +++ b/lib/unicode/import.ml @@ -0,0 +1,24 @@ +module List = Stdlib.ListLabels + +module Poly = struct + let equal = ( = ) + let compare = compare +end + +module Phys_equal = struct + let equal = ( == ) +end + +let ( = ) = Int.equal +let ( == ) = [ `Use_phys_equal ] +let ( < ) (x : int) (y : int) = x < y +let ( > ) (x : int) (y : int) = x > y +let min = Int.min +let max = Int.max +let compare = Int.compare + +module Int = struct + let[@warning "-32"] hash (x : int) = Hashtbl.hash x + + include Stdlib.Int +end diff --git a/lib/unicode/mark_infos.ml b/lib/unicode/mark_infos.ml new file mode 100644 index 00000000..f4a4251e --- /dev/null +++ b/lib/unicode/mark_infos.ml @@ -0,0 +1,54 @@ +open Import + +type t = int array + +let make marks = + let len = 1 + List.fold_left ~f:(fun ma (i, _) -> max ma i) ~init:(-1) marks in + let t = Array.make len (-1) in + List.iter ~f:(fun (i, v) -> t.(i) <- v) marks; + t +;; + +let test t i = if 2 * i >= Array.length t then false else t.(2 * 
i) <> -1 + +module Offset = struct + type t = int + + let is_present t = t >= 0 + let get_no_check t = t +end + +let start_offset t i = + let start_i = 2 * i in + if start_i + 1 >= Array.length t then -1 else t.(start_i) +;; + +let stop_offset t i = + let stop_i = (2 * i) + 1 in + if stop_i >= Array.length t then -1 else t.(stop_i) +;; + +let offset t i = + let start_i = 2 * i in + let stop_i = start_i + 1 in + if stop_i >= Array.length t + then None + else ( + let start = t.(start_i) in + if start = -1 + then None + else ( + let stop = t.(stop_i) in + Some (start, stop))) +;; + +let iteri t ~f = + for i = 0 to (Array.length t / 2) - 1 do + let idx = 2 * i in + let start = t.(idx) in + if start <> -1 + then ( + let stop = t.(idx + 1) in + f i start stop) + done +;; diff --git a/lib/unicode/mark_infos.mli b/lib/unicode/mark_infos.mli new file mode 100644 index 00000000..f729058f --- /dev/null +++ b/lib/unicode/mark_infos.mli @@ -0,0 +1,17 @@ +(** store mark information for groups in an array *) +type t + +val make : (int * int) list -> t +val offset : t -> int -> (int * int) option +val test : t -> int -> bool +val iteri : t -> f:(int -> int -> int -> unit) -> unit + +module Offset : sig + type t + + val is_present : t -> bool + val get_no_check : t -> int +end + +val start_offset : t -> int -> Offset.t +val stop_offset : t -> int -> Offset.t diff --git a/lib/unicode/parse_buffer.ml b/lib/unicode/parse_buffer.ml new file mode 100644 index 00000000..2c555bdd --- /dev/null +++ b/lib/unicode/parse_buffer.ml @@ -0,0 +1,135 @@ +exception Parse_error + +module type T = sig + type t + type letter + + val create : string -> t + val junk : t -> unit + val unget : t -> unit + val eos : t -> bool + val test : t -> letter -> bool * int + val test2 : t -> letter -> letter -> bool + val get : t -> letter + val accept : t -> letter -> bool + val accept_s : t -> string -> bool + val integer : t -> int option +end + +module Make (Cset : Cset.T) = struct + type letter = Cset.letter + 
type t = { str : string; mutable pos : int } + + let create str = { str; pos = 0 } + + let unget t = + if t.pos = 0 then () + else + let w = + try Cset.Codec.width_rev String.unsafe_get t.str (t.pos - 1) + with _ -> 1 + in + t.pos <- t.pos - w + + let junk t = (* skip the letter starting at [pos]: measure forwards, 1 byte on decode failure *) + let w = + try Cset.Codec.width String.unsafe_get t.str t.pos with _ -> 1 + in + t.pos <- t.pos + w + + let eos t = t.pos = String.length t.str + + let test t letter = + if not (eos t) then + let b, pos_next = + Cset.Codec.Unsafe.unsafe_bytes_with_next_pos t.str t.pos + in + let b' = Cset.Codec.to_bytes letter in + (Bytes.equal b b', pos_next) + else (false, 0) + + let test2 t letter letter' = + t.pos + 1 < String.length t.str + && + let r, pos_next = test t letter in + r + (* let b = Cset.CodePage.from_letter letter in + let cp'', pos_next = + Cset.Codec.Unsafe.unsafe_bytes_with_next_pos t.str t.pos + |> fun (bytes, pos_next) -> + (Cset.Codec.from_bytes bytes |> Cset.CodePage.from_letter, pos_next) + in + Cset.CodePage.equal cp'' cp *) + && + let b = Cset.Codec.to_bytes letter' in + let b' = Cset.Codec.Unsafe.unsafe_bytes t.str pos_next in + Bytes.equal b b' + (* let cp' = Cset.CodePage.from_letter letter' in + let cp'' = + Cset.Codec.Unsafe.unsafe_bytes t.str pos_next + |> Cset.Codec.from_bytes |> Cset.CodePage.from_letter + in + Cset.CodePage.equal cp'' cp' *) + + let accept t c = + let r, pos_next = test t c in + if r then t.pos <- pos_next; + r + + let get t = + let letter, pos_next = + Cset.Codec.Unsafe.unsafe_bytes_with_next_pos t.str t.pos + |> fun (bytes, pos_next) -> (Cset.Codec.from_bytes bytes, pos_next) + in + t.pos <- pos_next; + letter + + let accept_s t s' = + let len = String.length s' in + try + let rec iter max ofs = + if ofs >= max then ( + t.pos <- t.pos + len; + true) + else + let w = try Cset.Codec.width String.unsafe_get s' ofs with _ -> 1 in + let b = Cset.Codec.Unsafe.unsafe_slice s' ofs w in + let b' = Cset.Codec.Unsafe.unsafe_slice t.str (t.pos + ofs) w in + if Bytes.equal b 
b' then iter max (ofs + w) else false + (* let letter = + Cset.Codec.Unsafe.unsafe_slice s' ofs w |> Cset.Codec.from_bytes + in + let letter' = + Cset.Codec.Unsafe.unsafe_slice t.str (t.pos + ofs) w + |> Cset.Codec.from_bytes + in + if Cset.Codec.equal letter letter' then iter max (ofs + w) else false *) + in + iter len 0 + with _ -> false + + let zero = Cset.CodePage.(to_int @@ from_letter @@ of_char '0') + + let rec integer' t i = + if eos t then Some i + else + let cp = get t |> Cset.CodePage.from_letter in + match Cset.mem cp Cset.cdigit with + | true -> + let i' = (10 * i) + (Cset.CodePage.to_int cp - zero) in + if i' < i then raise Parse_error; + integer' t i' + | _ -> + unget t; + Some i + + let integer t = + if eos t then None + else + let cp = get t |> Cset.CodePage.from_letter in + match Cset.mem cp Cset.cdigit with + | true -> integer' t (Cset.CodePage.to_int cp - zero) + | _ -> + unget t; + None +end diff --git a/lib/unicode/parse_buffer.mli b/lib/unicode/parse_buffer.mli new file mode 100644 index 00000000..2fc0e7af --- /dev/null +++ b/lib/unicode/parse_buffer.mli @@ -0,0 +1,19 @@ + +exception Parse_error + +module type T = sig + type letter + type t + val create : string -> t + val junk : t -> unit + val unget : t -> unit + val eos : t -> bool + val test : t -> letter -> bool * int + val test2 : t -> letter -> letter -> bool + val get : t -> letter + val accept : t -> letter -> bool + val accept_s : t -> string -> bool + val integer : t -> int option +end + +module Make (Cset : Cset.T) : T with type letter = Cset.letter diff --git a/lib/unicode/pcre.ml b/lib/unicode/pcre.ml new file mode 100644 index 00000000..9a72e849 --- /dev/null +++ b/lib/unicode/pcre.ml @@ -0,0 +1,223 @@ +exception Parse_error = Perl.Parse_error +exception Not_supported = Perl.Not_supported + +type flag = [ `CASELESS | `MULTILINE | `ANCHORED | `DOTALL ] + +type split_result = + | Text of string + | Delim of string + | Group of int * string + | NoGroup + +module type T = sig + 
type core + type re + type groups + + (** [re ~flags s] creates the regexp [s] using the pcre syntax. *) + val re : ?flags:flag list -> string -> core + + val re_result : + ?flags:flag list -> + string -> + (core, [ `Not_supported | `Parse_error ]) result + + (** [regexp ~flags s] compiles the regexp [s] using the pcre syntax. *) + val regexp : ?flags:flag list -> string -> re + + (** [extract ~rex s] executes [rex] on [s] and returns the matching groups. *) + val extract : rex:re -> string -> string array + + (** Equivalent to {!Core.exec}. *) + val exec : rex:re -> ?pos:int -> string -> groups + + (** Equivalent to {!Core.Group.get}. *) + val get_substring : groups -> int -> string + + (** Return the names of named groups. *) + val names : re -> string array + + (** Return the first matched named group, or raise [Not_found]. Prefer to use + the non-raising version [get_named_substring_opt] *) + val get_named_substring : re -> string -> groups -> string + + (** Return the first matched named group, or [None] if there is none. *) + val get_named_substring_opt : re -> string -> groups -> string option + + (** Equivalent to {!Core.execp}. 
*) + val pmatch : rex:re -> string -> bool + + val substitute : rex:re -> subst:(string -> string) -> string -> string + val full_split : ?max:int -> rex:re -> string -> split_result list + val split : rex:re -> string -> string list + val quote : string -> string + + (** {2 Deprecated} *) + + type substrings = Group.t +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Re = Core.Make (Cset) (Color_map) + + type core = Re.t + type re = Re.re + type groups = Re.Group.t + + module Perl = Perl.Make (Cset) (Color_map) + + let re ?(flags = []) pat = + let opts = + List.map + (function + | `CASELESS -> `Caseless + | `MULTILINE -> `Multiline + | `ANCHORED -> `Anchored + | `DOTALL -> `Dotall) + flags + in + Perl.re ~opts pat + + let re_result ?flags s = + match re ?flags s with + | s -> Ok s + | exception Not_supported -> Error `Not_supported + | exception Parse_error -> Error `Parse_error + + let regexp ?flags pat = Re.compile (re ?flags pat) + let extract ~rex s = Re.Group.all (Re.exec rex s) + let exec ~rex ?pos s = Re.exec rex ?pos s + let names rex = Re.group_names rex |> List.map fst |> Array.of_list + + let get_named_substring_opt rex name s = + let rec loop = function + | [] -> None + | (n, i) :: rem when n = name -> ( + match Re.Group.get_opt s i with None -> loop rem | Some _ as s -> s) + | _ :: rem -> loop rem + in + loop (Re.group_names rex) + + let get_substring_ofs s i = Re.Group.offset s i + let pmatch ~rex s = Re.execp rex s + + let substitute ~rex ~subst str = + let b = Buffer.create 1024 in + let rec loop pos on_match = + if Re.execp ~pos rex str then ( + let ss = Re.exec ~pos rex str in + let start, fin = Re.Group.offset ss 0 in + if on_match && start = pos && start = fin then ( + if + (* Empty match following a match *) + pos < String.length str + then ( + Buffer.add_char b str.[pos]; + loop (pos + 1) false)) + else + let pat = 
Re.Group.get ss 0 in + Buffer.add_substring b str pos (start - pos); + Buffer.add_string b (subst pat); + if start = fin then ( + if + (* Manually advance by one after an empty match *) + fin < String.length str + then ( + Buffer.add_char b str.[fin]; + loop (fin + 1) false)) + else loop fin true) + else Buffer.add_substring b str pos (String.length str - pos) + in + loop 0 false; + Buffer.contents b + + let split ~rex s = + let rec split accu start = + if start = String.length s then accu + else + match + let g = Re.exec rex s ~pos:start in + if Group.stop g 0 = start then Re.exec rex s ~pos:(start + 1) else g + with + | exception Not_found -> + String.sub s start (String.length s - start) :: accu + | g -> + let next = Group.stop g 0 in + split (String.sub s start (Group.start g 0 - start) :: accu) next + in + match Re.exec rex s ~pos:0 with + | g -> + List.rev + (if Group.start g 0 = 0 then split [] (Group.stop g 0) + else split [ String.sub s 0 (Group.start g 0) ] (Group.stop g 0)) + | exception Not_found -> if s = "" then [] else [ s ] + + (* From PCRE *) + let string_unsafe_sub s ofs len = + let r = Bytes.create len in + Bytes.unsafe_blit s ofs r 0 len; + Bytes.unsafe_to_string r + + let quote s = + let len = String.length s in + let buf = Bytes.create (len lsl 1) in + let pos = ref 0 in + for i = 0 to len - 1 do + match String.unsafe_get s i with + | ('\\' | '^' | '$' | '.' | '[' | '|' | '(' | ')' | '?' 
| '*' | '+' | '{') + as c -> + Bytes.unsafe_set buf !pos '\\'; + incr pos; + Bytes.unsafe_set buf !pos c; + incr pos + | c -> + Bytes.unsafe_set buf !pos c; + incr pos + done; + string_unsafe_sub buf 0 !pos + + let full_split ?(max = 0) ~rex s = + if String.length s = 0 then [] + else if max = 1 then [ Text s ] + else + let results = Re.split_full rex s in + let matches = + List.map + (function + | `Text s -> [ Text s ] + | `Delim d -> + let matches = Re.Group.all_offset d in + let delim = Re.Group.get d 0 in + Delim delim + :: + (let l = ref [] in + for i = 1 to Array.length matches - 1 do + l := + (if matches.(i) = (-1, -1) then NoGroup + else Group (i, Re.Group.get d i)) + :: !l + done; + List.rev !l)) + results + in + List.concat matches + + type substrings = Group.t + + let get_substring s i = Re.Group.get s i + + let get_named_substring rex name s = + match get_named_substring_opt rex name s with + | None -> raise Not_found + | Some s -> s +end diff --git a/lib/unicode/pcre.mli b/lib/unicode/pcre.mli new file mode 100644 index 00000000..99528938 --- /dev/null +++ b/lib/unicode/pcre.mli @@ -0,0 +1,77 @@ +(** NOTE: Only a subset of the PCRE spec is supported *) + +exception Parse_error +exception Not_supported + +type flag = [ `CASELESS | `MULTILINE | `ANCHORED | `DOTALL ] + +(** Result of a {!Pcre.full_split} *) +type split_result = + | Text of string (** Text part of the split string *) + | Delim of string (** Delimiter part of the split string *) + | Group of int * string + (** Subgroup of matched delimiter (subgroup_nr, subgroup_str) *) + | NoGroup (** Unmatched subgroup *) + +module type T = sig + type core + type re + type groups + + (** [re ~flags s] creates the regexp [s] using the pcre syntax. *) + val re : ?flags:flag list -> string -> core + + val re_result : + ?flags:flag list -> + string -> + (core, [ `Not_supported | `Parse_error ]) result + + (** [regexp ~flags s] compiles the regexp [s] using the pcre syntax. 
*) + val regexp : ?flags:flag list -> string -> re + + (** [extract ~rex s] executes [rex] on [s] and returns the matching groups. *) + val extract : rex:re -> string -> string array + + (** Equivalent to {!Core.exec}. *) + val exec : rex:re -> ?pos:int -> string -> groups + + (** Equivalent to {!Core.Group.get}. *) + val get_substring : groups -> int -> string + + (** Return the names of named groups. *) + val names : re -> string array + + (** Return the first matched named group, or raise [Not_found]. Prefer to use + the non-raising version [get_named_substring_opt] *) + val get_named_substring : re -> string -> groups -> string + + (** Return the first matched named group, or [None] if there is none. *) + val get_named_substring_opt : re -> string -> groups -> string option + + (** Equivalent to {!Core.Group.offset}. *) + val get_substring_ofs : groups -> int -> int * int + + (** Equivalent to {!Core.execp}. *) + val pmatch : rex:re -> string -> bool + + val substitute : rex:re -> subst:(string -> string) -> string -> string + val full_split : ?max:int -> rex:re -> string -> split_result list + val split : rex:re -> string -> string list + val quote : string -> string + + (** {2 Deprecated} *) + + type substrings = Group.t +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type core = Core.Make(Cset)(Color_map).t + and type re = Core.Make(Cset)(Color_map).re + and type groups = Core.Make(Cset)(Color_map).Group.t diff --git a/lib/unicode/perl.ml b/lib/unicode/perl.ml new file mode 100644 index 00000000..7babce7f --- /dev/null +++ b/lib/unicode/perl.ml @@ -0,0 +1,401 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software 
Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* TODO: https://www.pcre.org/original/doc/html/pcrepattern.html + add in module Pcre: + - \p{xx}: a character with the xx property + - \P{xx}: a character without the xx property + - \X + - Xan Any alphanumeric character + - Xps Any POSIX space character + - Xsp Any Perl space character + - Xwd Any Perl "word" character +*) + +exception Parse_error = Parse_buffer.Parse_error +exception Not_supported + +type opt = + [ `Ungreedy | `Dotall | `Dollar_endonly | `Multiline | `Anchored | `Caseless ] + +module type T = sig + type core + type re + + (** Parsing of a Perl-style regular expression *) + val re : ?opts:opt list -> string -> core + + val re_result : + ?opts:opt list -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** (Same as [Re.compile]) *) + val compile : core -> re + + (** Regular expression compilation *) + val compile_pat : ?opts:opt list -> string -> re +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module CodePage = Cset.CodePage + module Categories = CodePage.Categories + module Posix_class = Posix_class.Make (Cset) (Color_map) + module Parse_buffer = Posix_class.Parse_buffer + module Re = Core.Make (Cset) (Color_map) + + module Ast = struct + include Ast + include Ast.Make (Cset) (Color_map) + end + + let ( !! 
) = CodePage.of_char + let ( !^ ) = fun x -> CodePage.from_letter @@ CodePage.of_char x + + let acc_digits = (* [digits] is least-significant first; [i] is the place value of [d] *) + let rec loop base digits acc i = + match digits with + | [] -> acc + | d :: digits -> + let acc = acc + (d * i) in + let i = i * base in + loop base digits acc i + in + fun ~base ~digits -> loop base digits 0 1 + + let letter_of_int x = + try CodePage.(of_int x |> to_letter) + with _exn -> + raise Parse_error + + type elem = Letter of Cset.letter | Set of Re.t + + let letter_b = Letter !!'\008' + let letter_newline = Letter !!'\n' + let letter_cr = Letter !!'\r' + let letter_tab = Letter !!'\t' + let word_letter = [ Re.alnum; Re.letter !!'_' ] + let word = Set (Re.alt word_letter) + let not_word = Set (Re.compl word_letter) (* \W inside brackets: complement of \w *) + let space = Set Re.space + let not_space = Set (Re.compl [ Re.space ]) + let digit = Set Re.digit + let not_digit = Set (Re.compl [ Re.digit ]) + let xdigit_lowercase = Cset.inter Cset.xdigit Cset.clower + let xdigit_uppercase = Cset.inter Cset.xdigit Cset.cupper + + let cname = + Cset.union Cset.calpha @@ Cset.single CodePage.(from_letter @@ of_char '_') + + let parse ~multiline ~dollar_endonly ~dotall ~ungreedy s = + let buf = Parse_buffer.create s in + let accept = Parse_buffer.accept buf in + let eos () = Parse_buffer.eos buf in + let test c = Parse_buffer.test buf c in + let unget () = Parse_buffer.unget buf in + let get () = Parse_buffer.get buf in + let greedy_mod r = + let gr = accept !!'?' 
in + let gr = if ungreedy then not gr else gr in + if gr then Re.non_greedy r else Re.greedy r + in + let rec regexp () = regexp' [ branch () ] + and regexp' left = + if accept !!'|' then regexp' (branch () :: left) + else Re.alt (List.rev left) + and branch () = branch' [] + and branch' left = + if eos () || (fst @@ test !!'|') || (fst @@ test !!')') then + Re.seq (List.rev left) + else branch' (piece () :: left) + and in_brace ~f ~init = + match accept !!'{' with + | false -> None + | true -> + let rec loop acc = + if accept !!'}' then acc + else + let acc = f acc in + loop acc + in + Some (loop init) + and piece () = + let r = atom () in + if accept !!'*' then greedy_mod (Re.rep r) + else if accept !!'+' then greedy_mod (Re.rep1 r) + else if accept !!'?' then greedy_mod (Re.opt r) + else if accept !!'{' then ( + match Parse_buffer.integer buf with + | Some i -> + let j = if accept !!',' then Parse_buffer.integer buf else Some i in + if not (accept !!'}') then raise Parse_error; + (match j with Some j when j < i -> raise Parse_error | _ -> ()); + greedy_mod (Re.repn r i j) + | None -> + unget (); + r) + else r + and atom () = + if accept !!'.' then if dotall then Re.any else Re.notnl + else if accept !!'(' then ( + if accept !!'?' 
then + if accept !!':' then ( + let r = regexp () in + if not (accept !!')') then raise Parse_error; + r) + else if accept !!'#' then comment () + else if accept !!'<' then ( + let name = name () in + let r = regexp () in + if not (accept !!')') then raise Parse_error; + Re.group ~name r) + else raise Parse_error + else + let r = regexp () in + if not (accept !!')') then raise Parse_error; + Re.group r) + else if accept !!'^' then if multiline then Re.bol else Re.bos + else if accept !!'$' then + if multiline then Re.eol else if dollar_endonly then Re.leol else Re.eos + else if accept !!'[' then + if accept !!'^' then Re.compl (bracket []) else Re.alt (bracket []) + else if accept !!'\\' then ( + (* XXX + - Back-references + - \cx (control-x), \ddd + *) + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | cp when CodePage.equal cp !^'w' -> + Re.alt [ Re.alnum; Re.letter !!'_' ] + | cp when CodePage.equal cp !^'W' -> + Re.compl [ Re.alnum; Re.letter !!'_' ] + | cp when CodePage.equal cp !^'s' -> Re.space + | cp when CodePage.equal cp !^'S' -> Re.compl [ Re.space ] + | cp when CodePage.equal cp !^'d' -> Re.digit + | cp when CodePage.equal cp !^'D' -> Re.compl [ Re.digit ] + | cp when CodePage.equal cp !^'b' -> Re.alt [ Re.bow; Re.eow ] + | cp when CodePage.equal cp !^'B' -> Re.not_boundary + | cp when CodePage.equal cp !^'A' -> Re.bos + | cp when CodePage.equal cp !^'Z' -> Re.leol + | cp when CodePage.equal cp !^'z' -> Re.eos + | cp when CodePage.equal cp !^'G' -> Re.start + | cp when CodePage.equal cp !^'e' -> Re.letter !!'\x1b' + | cp when CodePage.equal cp !^'f' -> Re.letter !!'\x0c' + | cp when CodePage.equal cp !^'n' -> Re.letter !!'\n' + | cp when CodePage.equal cp !^'r' -> Re.letter !!'\r' + | cp when CodePage.equal cp !^'t' -> Re.letter !!'\t' + | cp when CodePage.equal cp !^'Q' -> quote (Buffer.create 12) + | cp when CodePage.equal cp !^'E' -> raise Parse_error + | cp when CodePage.equal cp !^'x' -> + let c1, c2 = + match 
in_brace ~init:[] ~f:(fun acc -> hexdigit () :: acc) with + | Some [ c2; c1 ] -> (* digits were consed as read, so the low nibble comes first *) + (c1, c2) + | Some [ c2 ] -> + (0, c2) + | Some _l -> + raise Parse_error + | None -> + let c1 = hexdigit () in + let c2 = hexdigit () in + (c1, c2) + in + let code = (c1 * 16) + c2 in + let t = Re.letter (letter_of_int code) in + t + | cp when CodePage.equal cp !^'o' -> ( + match + in_brace ~init:[] ~f:(fun acc -> + match maybe_octaldigit () with + | None -> raise Parse_error + | Some p -> p :: acc) + with + | None -> raise Parse_error + | Some digits -> + Re.letter (letter_of_int (acc_digits ~base:8 ~digits))) + | cp when Cset.mem cp Cset.calpha -> raise Parse_error + | cp when Cset.mem cp Cset.cdigit -> + let n1 = + CodePage.to_int cp - CodePage.(to_int @@ from_letter @@ of_char '0') + in + if n1 < 8 then + let n2 = maybe_octaldigit () in + let n3 = maybe_octaldigit () in + match (n2, n3) with + | Some n2, Some n3 -> + Re.letter (letter_of_int ((n1 * (8 * 8)) + (n2 * 8) + n3)) + | _, _ -> raise Not_supported + else raise Not_supported + | c -> Re.letter @@ CodePage.to_letter c) + else ( + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | cp when CodePage.equal cp !^'*' -> raise Parse_error + | cp when CodePage.equal cp !^'+' -> raise Parse_error + | cp when CodePage.equal cp !^'?' 
-> raise Parse_error + | cp when CodePage.equal cp !^')' -> raise Parse_error + | cp when CodePage.equal cp !^'\\' -> raise Parse_error + | c -> Re.letter @@ CodePage.to_letter c) + and quote buf = + if accept !!'\\' then ( + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | cp when CodePage.equal cp !^'E' -> + let qs = Buffer.contents buf in + Re.str qs + | cp -> + Cset.Codec.add buf !!'\\'; + Cset.Codec.add buf @@ CodePage.to_letter cp; + quote buf) + else ( + if eos () then raise Parse_error; + Cset.Codec.add buf @@ get (); + quote buf) + and hexdigit () = + if eos () then raise Parse_error; + match CodePage.from_letter (get ()) with + | cp when Cset.mem cp Cset.cdigit -> + CodePage.to_int cp - CodePage.(to_int @@ from_letter @@ of_char '0') + | cp when Cset.mem cp xdigit_lowercase -> + CodePage.to_int cp + - CodePage.(to_int @@ from_letter @@ of_char 'a') + + 10 + | cp when Cset.mem cp xdigit_uppercase -> + CodePage.to_int cp + - CodePage.(to_int @@ from_letter @@ of_char 'A') + + 10 + | _ -> raise Parse_error + and maybe_octaldigit () = + if eos () then None + else + match CodePage.from_letter (get ()) with + | cp when Cset.mem cp Cset.cdigit -> + let n = + CodePage.to_int cp - CodePage.(to_int @@ from_letter @@ of_char '0') + in + if n < 8 then Some n else None + | _ -> None + and name () = + if eos () then raise Parse_error + else + match CodePage.from_letter @@ get () with + | cp when Cset.mem cp cname -> + let b = Buffer.create 32 in + Cset.Codec.add b @@ CodePage.to_letter cp; + name' b + | _ -> raise Parse_error + and name' b = + if eos () then raise Parse_error + else + match CodePage.from_letter @@ get () with + | cp when Cset.mem cp Cset.cword -> + Cset.Codec.add b @@ CodePage.to_letter cp; + name' b + | cp when CodePage.equal cp !^'>' -> Buffer.contents b + | _ -> raise Parse_error + and bracket s = + if s <> [] && accept !!']' then s + else + match letter () with + | Set st -> bracket (st :: s) + | Letter l -> + if 
accept !!'-' then + if accept !!']' then Re.letter l :: Re.letter !!'-' :: s + else + bracket + (match letter () with + | Letter l' -> Re.rg l l' :: s + | Set st' -> Re.letter l :: Re.letter !!'-' :: st' :: s) + else bracket (Re.letter l :: s) + and letter () = + if eos () then raise Parse_error; + let l = get () in + if CodePage.equal (CodePage.from_letter l) !^'[' then ( + if accept !!'=' then raise Not_supported; + match Posix_class.parse buf with + | Some set -> Set set + | None -> + if accept !!'.' then ( + if eos () then raise Parse_error; + let l' = get () in + if not (accept !!'.') then raise Not_supported; + if not (accept !!']') then raise Parse_error; + Letter l') + else Letter l) + else if CodePage.equal (CodePage.from_letter l) !^'\\' then ( + if eos () then raise Parse_error; + let l' = get () in + (* XXX + \127, ... + *) + match CodePage.from_letter l' with + | cp when CodePage.equal cp !^'b' -> letter_b + | cp when CodePage.equal cp !^'n' -> letter_newline (*XXX*) + | cp when CodePage.equal cp !^'r' -> letter_cr (*XXX*) + | cp when CodePage.equal cp !^'t' -> letter_tab (*XXX*) + | cp when CodePage.equal cp !^'w' -> word + | cp when CodePage.equal cp !^'W' -> not_word + | cp when CodePage.equal cp !^'s' -> space + | cp when CodePage.equal cp !^'S' -> not_space + | cp when CodePage.equal cp !^'d' -> digit + | cp when CodePage.equal cp !^'D' -> not_digit + | cp when Cset.mem cp Cset.calpha -> raise Parse_error + | cp when Cset.mem cp Cset.cdigit -> raise Not_supported + | _ -> Letter l') + else Letter l + and comment () = + if eos () then raise Parse_error; + if accept !!')' then Re.epsilon + else ( + Parse_buffer.junk buf; + comment ()) + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + + let re ?(opts = []) s = + let r = + parse + ~multiline:(List.memq `Multiline opts) + ~dollar_endonly:(List.memq `Dollar_endonly opts) + ~dotall:(List.memq `Dotall opts) ~ungreedy:(List.memq `Ungreedy opts) s + in + let r = if List.memq 
`Anchored opts then Re.seq [ Re.start; r ] else r in + let r = if List.memq `Caseless opts then Re.no_case r else r in + r + + let compile = Re.compile + let compile_pat ?(opts = []) s = compile (re ~opts s) + + let re_result ?opts s = + match re ?opts s with + | s -> Ok s + | exception Not_supported -> Error `Not_supported + | exception Parse_error -> Error `Parse_error +end diff --git a/lib/unicode/perl.mli b/lib/unicode/perl.mli new file mode 100644 index 00000000..f312cb18 --- /dev/null +++ b/lib/unicode/perl.mli @@ -0,0 +1,59 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** Perl-style regular expressions *) + +exception Parse_error + +(** Errors that can be raised during the parsing of the regular expression *) +exception Not_supported + +type opt = + [ `Ungreedy | `Dotall | `Dollar_endonly | `Multiline | `Anchored | `Caseless ] + +module type T = sig + type core + type re + + (** Parsing of a Perl-style regular expression *) + val re : ?opts:opt list -> string -> core + + val re_result : + ?opts:opt list -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** (Same as [Re.compile]) *) + val compile : core -> re + + (** Regular expression compilation *) + val compile_pat : ?opts:opt list -> string -> re +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type core := Core.Make(Cset)(Color_map).t + and type re := Core.Make(Cset)(Color_map).re diff --git a/lib/unicode/pmark.ml b/lib/unicode/pmark.ml new file mode 100644 index 00000000..9686b245 --- /dev/null +++ b/lib/unicode/pmark.ml @@ -0,0 +1,29 @@ +module Pmark = struct + type t = int + + let equal (x : int) (y : int) = x = y + let compare (x : int) (y : int) = compare x y + let r = ref 0 + + let gen () = + incr r; + !r + ;; + + let pp = Format.pp_print_int +end + +include Pmark + +module Set = struct + module Set = Set.Make (Pmark) + + let[@warning "-32"] to_list x = + let open Set in + to_seq x |> List.of_seq + ;; + + include Set +end + +let to_dyn = Dyn.int diff --git a/lib/unicode/pmark.mli b/lib/unicode/pmark.mli new file mode 100644 index 00000000..100ee4ca --- /dev/null +++ b/lib/unicode/pmark.mli @@ -0,0 +1,13 @@ +type t = private int + +val equal : t -> t -> bool +val compare : t -> t -> int +val gen : unit -> t +val pp : t 
Fmt.t +val to_dyn : t -> Dyn.t + +module Set : sig + include Set.S with type elt = t + + val to_list : t -> elt list +end diff --git a/lib/unicode/posix.ml b/lib/unicode/posix.ml new file mode 100644 index 00000000..b258aeaf --- /dev/null +++ b/lib/unicode/posix.ml @@ -0,0 +1,191 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(* + What we could (should?) do: + - a* ==> longest ((shortest (no_group a)* ), a | ()) (!!!) 
+ - abc understood as (ab)c + - "((a?)|b)" against "ab" should not bind the first subpattern to anything + + Note that it should be possible to handle "(((ab)c)d)e" efficiently +*) + +exception Parse_error = Parse_buffer.Parse_error +exception Not_supported + +type opt = [ `ICase | `NoSub | `Newline ] + +module type T = sig + type core + type re + + (** Parsing of a Posix extended regular expression *) + val re : ?opts:opt list -> string -> core + + val re_result : + ?opts:opt list -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** [compile r] is defined as [Core.compile (Core.longest r)] *) + val compile : core -> re + + (** [compile_pat ?opts regex] compiles the Posix extended regular expression + [regexp] *) + val compile_pat : ?opts:opt list -> string -> re +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Re = Core.Make (Cset) (Color_map) + + type core = Re.t + type re = Re.re + + module CodePage = Cset.CodePage + module Categories = CodePage.Categories + module Posix_class = Posix_class.Make (Cset) (Color_map) + module Parse_buffer = Posix_class.Parse_buffer + + type elem = Letter of Cset.letter | Set of Re.t + + let ( !! 
) = CodePage.of_char + let ( !^ ) = fun x -> CodePage.from_letter @@ CodePage.of_char x + + let parse newline s = + let buf = Parse_buffer.create s in + let accept = Parse_buffer.accept buf in + let eos () = Parse_buffer.eos buf in + let test c = Parse_buffer.test buf c in + let unget () = Parse_buffer.unget buf in + let get () = Parse_buffer.get buf in + let rec regexp () = regexp' [ branch () ] + and regexp' left = + if accept !!'|' then regexp' (branch () :: left) + else Re.alt (List.rev left) + and branch () = branch' [] + and branch' left = + if eos () || (fst @@ test !!'|') || (fst @@ test !!')') then + Re.seq (List.rev left) + else branch' (piece () :: left) + and piece () = + let r = atom () in + if accept !!'*' then Re.rep (Re.nest r) + else if accept !!'+' then Re.rep1 (Re.nest r) + else if accept !!'?' then Re.opt r + else if accept !!'{' then ( + match Parse_buffer.integer buf with + | Some i -> + let j = if accept !!',' then Parse_buffer.integer buf else Some i in + if not (accept !!'}') then raise Parse_error; + (match j with Some j when j < i -> raise Parse_error | _ -> ()); + Re.repn (Re.nest r) i j + | None -> + unget (); + r) + else r + and atom () = + if accept !!'.' then if newline then Re.notnl else Re.any + else if accept !!'(' then ( + let r = regexp () in + if not (accept !!')') then raise Parse_error; + Re.group r) + else if accept !!'^' then if newline then Re.bol else Re.bos + else if accept !!'$' then if newline then Re.eol else Re.eos + else if accept !!'[' then + if accept !!'^' then + Re.diff (Re.compl (bracket [])) (Re.letter @@ CodePage.of_char '\n') + else Re.alt (bracket []) + else if accept !!'\\' then ( + if eos () then raise Parse_error; + match CodePage.from_letter @@ get () with + | cp + when CodePage.equal cp !^'|' || CodePage.equal cp !^'(' + || CodePage.equal cp !^')' || CodePage.equal cp !^'*' + || CodePage.equal cp !^'+' || CodePage.equal cp !^'?' + || CodePage.equal cp !^'[' || CodePage.equal cp !^'.' 
+ || CodePage.equal cp !^'^' || CodePage.equal cp !^'$' + || CodePage.equal cp !^'{' || CodePage.equal cp !^'\\' -> + Re.letter @@ CodePage.to_letter cp + | _ -> raise Parse_error) + else ( + if eos () then raise Parse_error; + match CodePage.from_letter @@ get () with + | cp + when CodePage.equal cp !^'*' || CodePage.equal cp !^'+' + || CodePage.equal cp !^'?' || CodePage.equal cp !^'{' + || CodePage.equal cp !^'\\' -> + raise Parse_error + | cp -> Re.letter @@ CodePage.to_letter cp) + and bracket s = + if s <> [] && accept !!']' then s + else + match letter () with + | Set st -> bracket (st :: s) + | Letter l -> + if accept !!'-' then + if accept !!']' then Re.letter l :: Re.letter !!'-' :: s + else + bracket + (match letter () with + | Letter l' -> Re.rg l l' :: s + | Set st' -> Re.letter l :: Re.letter !!'-' :: st' :: s) + else bracket (Re.letter l :: s) + and letter () = + if eos () then raise Parse_error; + let l = get () in + if Cset.Codec.equal l !!'[' then + match Posix_class.parse buf with + | Some set -> Set set + | None -> + if accept !!'.' 
then ( + if eos () then raise Parse_error; + let l' = get () in + if not (accept !!'.') then raise Not_supported; + if not (accept !!']') then raise Parse_error; + Letter l') + else Letter l + else Letter l + in + let res = regexp () in + if not (eos ()) then raise Parse_error; + res + + let re ?(opts = []) s = + let r = parse (List.memq `Newline opts) s in + let r = if List.memq `ICase opts then Re.no_case r else r in + let r = if List.memq `NoSub opts then Re.no_group r else r in + r + + let re_result ?opts s = + match re ?opts s with + | s -> Ok s + | exception Not_supported -> Error `Not_supported + | exception Parse_error -> Error `Parse_error + + let compile re = Re.compile (Re.longest re) + let compile_pat ?(opts = []) s = compile (re ~opts s) +end diff --git a/lib/unicode/posix.mli b/lib/unicode/posix.mli new file mode 100644 index 00000000..d02c7442 --- /dev/null +++ b/lib/unicode/posix.mli @@ -0,0 +1,114 @@ +(* + RE - A regular expression library + + Copyright (C) 2001 Jerome Vouillon + email: Jerome.Vouillon@pps.jussieu.fr + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation, with + linking exception; either version 2.1 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*) + +(** References: + - {{:http://www.opengroup.org/onlinepubs/007908799/xbd/re.html} re} + - {{:http://www.opengroup.org/onlinepubs/007908799/xsh/regcomp.html} + regcomp} + + Example of how to use this module (to parse some IRC logs): + + {[ + type msg = { time : string; author : string; content : string } + + let re = + Core.compile (Re_posix.re "([^:].*:[^:]*:[^:]{2})<.([^>]+)> (.+)$") + + (* parse a line *) + let match_line line = + try + let substrings = Core.exec re line in + let groups = Core.get_all substrings in + (* groups can be obtained directly by index within [substrings] *) + Some { time = groups.(1); author = groups.(2); content = groups.(3) } + with Not_found -> None (* regex didn't match *) + ]} *) + +(* XXX Character classes *) + +exception Parse_error + +(** Errors that can be raised during the parsing of the regular expression *) +exception Not_supported + +type opt = [ `ICase | `NoSub | `Newline ] + +module type T = sig + type core + type re + + (** Parsing of a Posix extended regular expression *) + val re : ?opts:opt list -> string -> core + + val re_result : + ?opts:opt list -> string -> (core, [ `Not_supported | `Parse_error ]) result + + (** [compile r] is defined as [Core.compile (Core.longest r)] *) + val compile : core -> re + + (** [compile_pat ?opts regex] compiles the Posix extended regular expression + [regexp] *) + val compile_pat : ?opts:opt list -> string -> re +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type core = Core.Make(Cset)(Color_map).t + and type re = Core.Make(Cset)(Color_map).re + +(* + Deviation from the standard / ambiguities in the standard + 
--------------------------------------------------------- + We tested the behavior of the Linux library (glibc) and the Solaris + library. + + (1) An expression [efg] should be parsed as [(ef)g]. + All implementations parse it as [e(fg)]. + (2) When matching the pattern "((a)|b)*" against the string "ab", + the sub-expression "((a)|b)" should match "b", and the + sub-expression "(a)" should not match anything. + In both implementations, the sub-expression "(a)" matches "a". + (3) When matching the pattern "(aa?)*" against the string "aaa", it is + not clear whether the final match of the sub-expression "(aa?)" is + the last "a" (all matches of the sub-expression are successively + maximized), or "aa" (the final match is maximized). + Both implementations implement the first case. + (4) When matching the pattern "((a?)|b)*" against the string "ab", + the sub-expression "((a?)|b)" should match the empty string at the + end of the string (it is better to match the empty string than to + match nothing). + In both implementations, this sub-expression matches "b". + (Strangely, in the Linux implementation, the sub-expression "(a?)" + correctly matches the empty string at the end of the string) + + This library behaves the same way as the other libraries for all + points, except for (2) and (4) where it follows the standard. + + The behavior of this library in these four cases may change in future + releases.
+*) diff --git a/lib/unicode/posix_class.ml b/lib/unicode/posix_class.ml new file mode 100644 index 00000000..4219f642 --- /dev/null +++ b/lib/unicode/posix_class.ml @@ -0,0 +1,82 @@ +module type T = sig + type core + type letter + + module Parse_buffer : Parse_buffer.T with type letter = letter + + val names : string list + val of_name : string -> core + val parse : Parse_buffer.t -> core option +end + +module Make (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = struct + + module Core = Core.Make(Cset)(Color_map) + + type core = Core.t + type letter = Cset.letter + + module Re = Core + + module Parse_buffer = struct + include Parse_buffer + include Parse_buffer.Make (Cset) + end + + let ( !! ) = Cset.CodePage.of_char + (** [of_name name] returns the [Re] class bound to [name]; any string not listed below raises [Invalid_argument]. *) + let of_name = function + | "alpha" -> Re.alpha + | "alnum" -> Re.alnum + | "ascii" -> Re.ascii + | "blank" -> Re.blank + | "cntrl" -> Re.cntrl + | "digit" -> Re.digit + | "lower" -> Re.lower + | "print" -> Re.print + | "space" -> Re.space + | "upper" -> Re.upper + | "word" -> Re.wordc + | "punct" -> Re.punct + | "graph" -> Re.graph + | "xdigit" -> Re.xdigit + | class_ -> invalid_arg ("Invalid pcre class: " ^ class_) + (** The names handled by [of_name]; [parse] tries them in this order. *) + let names = + [ + "alpha"; + "alnum"; + "ascii"; + "blank"; + "cntrl"; + "digit"; + "lower"; + "print"; + "space"; + "upper"; + "word"; + "punct"; + "graph"; + "xdigit"; + ] + (** [parse buf] returns [None] unless [buf] accepts ':'. Otherwise it accepts an optional '^', one of [names], then ":]", and returns that class (complemented when '^' was accepted); it raises [Parse_buffer.Parse_error] when no name or no ":]" is accepted. *) + let parse buf = + let accept = Parse_buffer.accept buf in + let accept_s = Parse_buffer.accept_s buf in + match accept !!':' with + | false -> None + | true -> + let compl = accept !!'^' in + let cls = + try List.find accept_s names + with Not_found -> raise Parse_buffer.Parse_error + in + if not (accept_s ":]") then raise Parse_buffer.Parse_error; + let posix_class = of_name cls in + Some (if compl then Re.compl [ posix_class ] else posix_class) +end diff --git a/lib/unicode/posix_class.mli b/lib/unicode/posix_class.mli new file mode 100644 index 00000000..94d96f49 ---
/dev/null +++ b/lib/unicode/posix_class.mli @@ -0,0 +1,19 @@ +module type T = sig + type core + type letter + module Parse_buffer : Parse_buffer.T with type letter = letter + + val names : string list + val of_name : string -> core + val parse : Parse_buffer.t -> core option +end + +module Make (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type core = Core.Make(Cset)(Color_map).t + and type letter = Cset.letter diff --git a/lib/unicode/re_unicode.ml b/lib/unicode/re_unicode.ml new file mode 100644 index 00000000..7aec2ce6 --- /dev/null +++ b/lib/unicode/re_unicode.ml @@ -0,0 +1,122 @@ +module type T = sig + module Cset : Cset.T + + module Color_map : + Color_map.T + with type cp := Cset.cp + and type letter := Cset.letter + and type cset_t := Cset.t + + module Re : sig + include Core.T with type letter := Cset.letter + include Replace.T with type re := re + module View : View.T with type letter := Cset.letter and type ast := t + module Emacs : Emacs.T with type core := t and type re := re + module Glob : Glob.T with type core := t + module Perl : Perl.T with type core := t and type re := re + + module Pcre : + Pcre.T with type core := t and type re := re and type groups := Group.t + + module Posix : Posix.T with type core := t and type re := re + module Str : Str.T + end +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Cset = Cset + module Color_map = Color_map + + module Re = struct + module Core = Core.Make (Cset) (Color_map) + module Replace = Replace.Make (Cset) (Color_map) + include Core + include Replace + module View = View.Make (Cset) (Color_map) + module Emacs = Emacs.Make (Cset) (Color_map) + module Glob = Glob.Make (Cset) (Color_map) + module Perl = Perl.Make (Cset) (Color_map) + module Pcre = Pcre.Make (Cset) (Color_map) + module Posix = 
Posix.Make (Cset) (Color_map) + module Str = Str.Make (Cset) (Color_map) + end +end + +module Utf8 = Make (Cset.Utf8) (Color_map.Utf8) +module Utf16be = Make (Cset.Utf16be) (Color_map.Utf16be) +module Utf16le = Make (Cset.Utf16le) (Color_map.Utf16le) +module Latin1 = Make (Cset.Latin1) (Color_map.Latin1) + +module Private = struct + include Import + module Fmt = Fmt + module Dyn = Dyn + module Cset = Cset.Utf8 + module Color_map = Color_map.Utf8 + + module Re = struct + module Category = Category.Make (Cset) + + module Automata = struct + include Automata + include Automata.Make (Cset) + end + + module Ast = struct + include Ast + include Ast.Make (Cset) (Color_map) + end + + module Compile = Compile.Make (Cset) (Color_map) + module Core = Core.Make (Cset) (Color_map) + module Replace = Replace.Make (Cset) (Color_map) + include Core + include Replace + module View = View.Make (Cset) (Color_map) + module Emacs = Emacs.Make (Cset) (Color_map) + module Glob = Glob.Make (Cset) (Color_map) + module Perl = Perl.Make (Cset) (Color_map) + module Pcre = Pcre.Make (Cset) (Color_map) + module Posix = Posix.Make (Cset) (Color_map) + module Str = Str.Make (Cset) (Color_map) + end +end + +(* module Utf16be = struct + module Re = struct + module Core = Core.Make (Cset.Utf16be) (Color_map.Utf16be) + module Replace = Replace.Make (Cset.Utf16be) (Color_map.Utf16be) + include Core + include Replace + module View = View.Make (Cset.Utf16be) (Color_map.Utf16be) + module Emacs = Emacs.Make (Cset.Utf16be) (Color_map.Utf16be) + module Glob = Glob.Make (Cset.Utf16be) (Color_map.Utf16be) + module Perl = Perl.Make (Cset.Utf16be) (Color_map.Utf16be) + module Pcre = Pcre.Make (Cset.Utf16be) (Color_map.Utf16be) + module Posix = Posix.Make (Cset.Utf16be) (Color_map.Utf16be) + module Str = Str.Make (Cset.Utf16be) (Color_map.Utf16be) + end +end + +module Utf16le = struct + module Re = struct + module Core = Core.Make (Cset.Utf16le) (Color_map.Utf16le) + module Replace = Replace.Make (Cset.Utf16le) 
(Color_map.Utf16le) + include Core + include Replace + module View = View.Make (Cset.Utf16le) (Color_map.Utf16le) + module Emacs = Emacs.Make (Cset.Utf16le) (Color_map.Utf16le) + module Glob = Glob.Make (Cset.Utf16le) (Color_map.Utf16le) + module Perl = Perl.Make (Cset.Utf16le) (Color_map.Utf16le) + module Pcre = Pcre.Make (Cset.Utf16le) (Color_map.Utf16le) + module Posix = Posix.Make (Cset.Utf16le) (Color_map.Utf16le) + module Str = Str.Make (Cset.Utf16le) (Color_map.Utf16le) + end +end *) diff --git a/lib/unicode/re_unicode.mli b/lib/unicode/re_unicode.mli new file mode 100644 index 00000000..e56e5de7 --- /dev/null +++ b/lib/unicode/re_unicode.mli @@ -0,0 +1,256 @@ +module type T = sig + module Cset : Cset.T + + module Color_map : + Color_map.T + with type cp := Cset.cp + and type letter := Cset.letter + and type cset_t := Cset.t + + module Re : sig + include Core.T with type letter := Cset.letter + include Replace.T with type re := re + module View : View.T with type letter := Cset.letter and type ast := t + module Emacs : Emacs.T with type core := t and type re := re + module Glob : Glob.T with type core := t + module Perl : Perl.T with type core := t and type re := re + + module Pcre : + Pcre.T with type core := t and type re := re and type groups := Group.t + + module Posix : Posix.T with type core := t and type re := re + module Str : Str.T + end +end + +module Make + (Cset : Cset.T) + (_ : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : T + +module Utf8 : T with type Cset.letter := Uchar.t +module Utf16be : T with type Cset.letter := Uchar.t +module Utf16le : T with type Cset.letter := Uchar.t +module Latin1 : T with type Cset.letter := Char.t + +(**/**) +module Private : sig + include module type of Import + + module Fmt : sig + include module type of Fmt + end + + module Dyn : sig + include module type of Dyn + end + + module Cset : sig + include Cset.T with type letter = Uchar.t + end + + module 
Color_map : sig + include + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t + end + + module Re : sig + module Category : sig + include Category.T with type letter = Cset.letter + end + + module Automata : sig + module Ids : sig + type t + + val create : unit -> t + end + + module Sem : sig + type t = [ `Longest | `Shortest | `First ] + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t + end + + module Rep_kind : sig + type t = [ `Greedy | `Non_greedy ] + + val to_dyn : t -> Dyn.t + val pp : t Fmt.t + end + + module Mark : sig + type t = private int + + val compare : t -> t -> int + val equal : t -> t -> bool + val pp : t Fmt.t + val to_dyn : t -> Dyn.t + val start : t + val prev : t -> t + val next : t -> t + val next2 : t -> t + val group_count : t -> int + val outside_range : t -> start_inclusive:t -> stop_inclusive:t -> bool + end + + module Idx : sig + type t + + val to_int : t -> int + end + + module Status : sig + type t = Failed | Match of Mark_infos.t * Pmark.Set.t | Running + end + + module type T = sig + type cset + type cp + type category + type expr + + val is_eps : expr -> bool + val pp : expr Fmt.t + val cst : Ids.t -> cset -> expr + val empty : Ids.t -> expr + val alt : Ids.t -> expr list -> expr + val seq : Ids.t -> Sem.t -> expr -> expr -> expr + val eps : Ids.t -> expr + val rep : Ids.t -> Rep_kind.t -> Sem.t -> expr -> expr + val mark : Ids.t -> Mark.t -> expr + val pmark : Ids.t -> Pmark.t -> expr + val erase : Ids.t -> Mark.t -> Mark.t -> expr + val before : Ids.t -> category -> expr + val after : Ids.t -> category -> expr + val rename : Ids.t -> expr -> expr + + (****) + + (* States of the automata *) + + module State : sig + type t + + val pp : t Fmt.t + val dummy : t + val create : category -> expr -> t + val idx : t -> Idx.t + val status_no_mutex : t -> Status.t + val status : Mutex.t -> t -> Status.t + val to_dyn : t -> Dyn.t + + module Table : Hashtbl.S with type key = t + end + + (****) + + (* 
Computation of the states following a given state *) + + module Working_area : sig + type t + + val create : unit -> t + val index_count : t -> int + end + + val delta : Working_area.t -> category -> cp -> State.t -> State.t + end + + include + T + with type cset = Cset.t + and type cp = Cset.cp + and type category = Category.t + end + + module Ast : sig + type ('a, _) ast = private + | Alternative : 'a list -> ('a, [> `Uncased ]) ast + | No_case : 'a -> ('a, [> `Cased ]) ast + | Case : 'a -> ('a, [> `Cased ]) ast + + type ('a, 'case) gen = private + | Set of 'a + | Ast of (('a, 'case) gen, 'case) ast + | Sequence of ('a, 'case) gen list + | Repeat of ('a, 'case) gen * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Group of string option * ('a, 'case) gen + | No_group of ('a, 'case) gen + | Nest of ('a, 'case) gen + | Pmark of Pmark.t * ('a, 'case) gen + | Sem of Automata.Sem.t * ('a, 'case) gen + | Sem_greedy of Automata.Rep_kind.t * ('a, 'case) gen + + include + Ast.T + with type letter = Cset.letter + and type cset_t = Cset.t + and type color_map_t = Color_map.t + end + + module Compile : sig + include Compile.T with type ast = Ast.t + end + + module Core : sig + include + Core.T + with type t = Ast.t + and type re = Compile.re + and type letter = Cset.letter + end + + module Replace : sig + include Replace.T with type re = Compile.re + end + + include + module type of Core + with type t = Ast.t + and type re = Compile.re + and type letter = Cset.letter + + include module type of Replace with type re := Compile.re + + module View : + View.T with type ast := Ast.t and type letter := Cset.letter + + module Emacs : Emacs.T with type core := Core.t and type re := Compile.re + module Glob : Glob.T with type core := Core.t + module Perl : Perl.T with type core := Core.t and type re := Compile.re + + module Pcre : + Pcre.T + with type core := Core.t 
+ and type re := Compile.re + and type groups := Group.t + + module Posix : Posix.T with type core := Core.t and type re := Compile.re + module Str : Str.T + end +end +(* module Utf16be : sig + module Re : T with type letter := Uchar.t +end + +module Utf16le : sig + module Re : T with type letter := Uchar.t +end *) diff --git a/lib/unicode/replace.ml b/lib/unicode/replace.ml new file mode 100644 index 00000000..835b37d4 --- /dev/null +++ b/lib/unicode/replace.ml @@ -0,0 +1,103 @@ +module type T = sig + type re + + (** [replace ~all re ~f s] iterates on [s], and replaces every occurrence of + [re] with [f substring] where [substring] is the current match. If + [all = false], then only the first occurrence of [re] is replaced. *) + val replace : + ?pos:int (** Default: 0 *) -> + ?len:int -> + ?all:bool (** Default: true. Otherwise only replace first occurrence *) -> + re (** matched groups *) -> + f:(Group.t -> string) (** how to replace *) -> + string (** string to replace in *) -> + string + + (** [replace_string ~all re ~by s] iterates on [s], and replaces every + occurrence of [re] with [by]. If [all = false], then only the first + occurrence of [re] is replaced. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.replace_string regex ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2;3;4;5;6;7]" + + # Re.replace_string regex ~all:false ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2,3,4,5,6,7]" + ]} *) + val replace_string : + ?pos:int (** Default: 0 *) -> + ?len:int -> + ?all:bool (** Default: true. 
Otherwise only replace first occurrence *) -> + re (** matched groups *) -> + by:string (** replacement string *) -> + string (** string to replace in *) -> + string +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Compile = Compile.Make (Cset) (Color_map) + + type re = Compile.re + + let replace ?(pos = 0) ?len ?(all = true) re ~f s = + if pos < 0 then invalid_arg "Re.replace"; + let limit = + match len with + | None -> String.length s + | Some l -> + if l < 0 || pos + l > String.length s then invalid_arg "Re.replace"; + pos + l + in + (* buffer into which we write the result *) + let buf = Buffer.create (String.length s) in + (* iterate on matched substrings. *) + let rec iter pos on_match = + if pos <= limit then + match + Compile.match_str ~groups:true ~partial:false re s ~pos + ~len:(limit - pos) + with + | Match substr -> + let p1 = Group.start_offset substr 0 |> Group.Offset.get_no_check in + let p2 = Group.stop_offset substr 0 |> Group.Offset.get_no_check in + if pos = p1 && p1 = p2 && on_match then ( + (* if we matched an empty string right after a match, + we must manually advance by 1 *) + if p2 < limit then Buffer.add_char buf s.[p2]; + iter (p2 + 1) false) + else ( + (* add string between previous match and current match *) + Buffer.add_substring buf s pos (p1 - pos); + (* what should we replace the matched group with? *) + let replacing = f substr in + Buffer.add_string buf replacing; + if all then + (* if we matched an empty string, we must manually advance by 1 *) + iter + (if p1 = p2 then ( + (* a non char could be past the end of string. e.g. 
$ *) + if p2 < limit then Buffer.add_char buf s.[p2]; + p2 + 1) + else p2) + (p1 <> p2) + else Buffer.add_substring buf s p2 (limit - p2)) + | Running _ -> () + | Failed -> Buffer.add_substring buf s pos (limit - pos) + in + iter pos false; + Buffer.contents buf + + let replace_string ?pos ?len ?all re ~by s = + replace ?pos ?len ?all re s ~f:(fun _ -> by) +end diff --git a/lib/unicode/replace.mli b/lib/unicode/replace.mli new file mode 100644 index 00000000..bd797d3b --- /dev/null +++ b/lib/unicode/replace.mli @@ -0,0 +1,48 @@ +module type T = sig + type re + + (** [replace ~all re ~f s] iterates on [s], and replaces every occurrence of + [re] with [f substring] where [substring] is the current match. If + [all = false], then only the first occurrence of [re] is replaced. *) + val replace : + ?pos:int (** Default: 0 *) -> + ?len:int -> + ?all:bool (** Default: true. Otherwise only replace first occurrence *) -> + re (** matched groups *) -> + f:(Group.t -> string) (** how to replace *) -> + string (** string to replace in *) -> + string + + (** [replace_string ~all re ~by s] iterates on [s], and replaces every + occurrence of [re] with [by]. If [all = false], then only the first + occurrence of [re] is replaced. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = + + # Re.replace_string regex ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2;3;4;5;6;7]" + + # Re.replace_string regex ~all:false ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2,3,4,5,6,7]" + ]} *) + val replace_string : + ?pos:int (** Default: 0 *) -> + ?len:int -> + ?all:bool (** Default: true. 
Otherwise only replace first occurrence *) -> + re (** matched groups *) -> + by:string (** replacement string *) -> + string (** string to replace in *) -> + string +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T with type re = Compile.Make(Cset)(Color_map).re diff --git a/lib/unicode/search.ml b/lib/unicode/search.ml new file mode 100644 index 00000000..88213bd0 --- /dev/null +++ b/lib/unicode/search.ml @@ -0,0 +1,137 @@ +module type T = sig + type re + + val all : ?pos:int -> ?len:int -> re -> string -> Group.t Seq.t + val matches : ?pos:int -> ?len:int -> re -> string -> string Seq.t + + val split_full : + ?pos:int -> + ?len:int -> + re -> + string -> + [> `Delim of Group.t | `Text of string ] Seq.t + + val split : ?pos:int -> ?len:int -> re -> string -> string Seq.t + val split_delim : ?pos:int -> ?len:int -> re -> string -> string Seq.t +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Compile = Compile.Make (Cset) (Color_map) + + type re = Compile.re + + let all ?(pos = 0) ?len re s : _ Seq.t = + if pos < 0 then invalid_arg "Re.all"; + (* index of the first position we do not consider. + !pos < limit is an invariant *) + let limit = + match len with + | None -> String.length s + | Some l -> + if l < 0 || pos + l > String.length s then invalid_arg "Re.all"; + pos + l + in + (* iterate on matches. 
When a match is found, search for the next + one just after its end *) + let rec aux pos on_match () = + if pos > limit then Seq.Nil (* no more matches *) + else + match + Compile.match_str ~groups:true ~partial:false re s ~pos + ~len:(limit - pos) + with + | Match substr -> + let p1 = Group.start_offset substr 0 |> Group.Offset.get_no_check in + let p2 = Group.stop_offset substr 0 |> Group.Offset.get_no_check in + if on_match && p1 = pos && p1 = p2 then + (* skip empty match right after a match *) + aux (pos + 1) false () + else + let pos = if p1 = p2 then p2 + 1 else p2 in + Seq.Cons (substr, aux pos (p1 <> p2)) + | Running _ | Failed -> Seq.Nil + in + aux pos false + + let matches ?pos ?len re s : _ Seq.t = + all ?pos ?len re s |> Seq.map (fun sub -> Group.get sub 0) + + let split_full ?(pos = 0) ?len re s : _ Seq.t = + if pos < 0 then invalid_arg "Re.split"; + let limit = + match len with + | None -> String.length s + | Some l -> + if l < 0 || pos + l > String.length s then invalid_arg "Re.split"; + pos + l + in + (* i: start of delimited string + pos: first position after last match of [re] + limit: first index we ignore (!pos < limit is an invariant) *) + let pos0 = pos in + let rec aux state i pos () = + match state with + | `Idle when pos > limit -> + (* We had an empty match at the end of the string *) + assert (i = limit); + Seq.Nil + | `Idle -> ( + match + Compile.match_str ~groups:true ~partial:false re s ~pos + ~len:(limit - pos) + with + | Match substr -> + let p1 = Group.start_offset substr 0 |> Group.Offset.get_no_check in + let p2 = Group.stop_offset substr 0 |> Group.Offset.get_no_check in + let pos = if p1 = p2 then p2 + 1 else p2 in + let old_i = i in + let i = p2 in + if old_i = p1 && p1 = p2 && p1 > pos0 then + (* Skip empty match right after a delimiter *) + aux state i pos () + else if p1 > pos0 then + (* string does not start by a delimiter *) + let text = String.sub s old_i (p1 - old_i) in + let state = `Yield (`Delim substr) in + Seq.Cons 
(`Text text, aux state i pos) + else Seq.Cons (`Delim substr, aux state i pos) + | Running _ -> Seq.Nil + | Failed -> + if i < limit then + let text = String.sub s i (limit - i) in + (* yield last string *) + Seq.Cons (`Text text, aux state limit pos) + else Seq.Nil) + | `Yield x -> Seq.Cons (x, aux `Idle i pos) + in + aux `Idle pos pos + + let split ?pos ?len re s : _ Seq.t = + let seq = split_full ?pos ?len re s in + let rec filter seq () = + match seq () with + | Seq.Nil -> Seq.Nil + | Seq.Cons (`Delim _, tl) -> filter tl () + | Seq.Cons (`Text s, tl) -> Seq.Cons (s, filter tl) + in + filter seq + + let split_delim ?pos ?len re s : _ Seq.t = + let seq = split_full ?pos ?len re s in + let rec filter ~delim seq () = + match seq () with + | Seq.Nil -> if delim then Seq.Cons ("", fun () -> Seq.Nil) else Seq.Nil + | Seq.Cons (`Delim _, tl) -> + if delim then Seq.Cons ("", fun () -> filter ~delim:true tl ()) + else filter ~delim:true tl () + | Seq.Cons (`Text s, tl) -> Seq.Cons (s, filter ~delim:false tl) + in + filter ~delim:true seq +end diff --git a/lib/unicode/search.mli b/lib/unicode/search.mli new file mode 100644 index 00000000..339f26e4 --- /dev/null +++ b/lib/unicode/search.mli @@ -0,0 +1,25 @@ +module type T = sig + type re + + val all : ?pos:int -> ?len:int -> re -> string -> Group.t Seq.t + val matches : ?pos:int -> ?len:int -> re -> string -> string Seq.t + + val split_full : + ?pos:int -> + ?len:int -> + re -> + string -> + [> `Delim of Group.t | `Text of string ] Seq.t + + val split : ?pos:int -> ?len:int -> re -> string -> string Seq.t + val split_delim : ?pos:int -> ?len:int -> re -> string -> string Seq.t +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T with type re = Compile.Make(Cset)(Color_map).re diff --git a/lib/unicode/slice.ml b/lib/unicode/slice.ml new file mode 100644 index 00000000..fb745956 --- /dev/null +++ b/lib/unicode/slice.ml 
@@ -0,0 +1,84 @@
+open Import
+
+(* A window of [len] bytes of [s] starting at byte offset [pos]. *)
+type t =
+  { s : string
+  ; pos : int
+  ; len : int
+  }
+
+module L = struct
+  type nonrec t = t list
+
+  (* [get_substring slices ~start ~stop] views [slices] as one concatenated
+     byte sequence and returns its bytes from offset [start] (inclusive) to
+     [stop] (exclusive). Returns [""] when [stop = start]; for [stop > start]
+     it hits [assert false] if the slices run out before [stop] is reached. *)
+  let get_substring slices ~start ~stop =
+    if stop = start
+    then ""
+    else (
+      let slices =
+        (* Skip the first [start] bytes, trimming the front of the slice that
+           contains offset [start]. *)
+        let rec drop slices remains =
+          if remains = 0
+          then slices
+          else (
+            match slices with
+            | [] -> assert false
+            | ({ s = _; pos; len } as slice) :: xs ->
+              let remains' = remains - len in
+              if remains' >= 0
+              then drop xs remains'
+              else (
+                let pos = pos + remains in
+                let len = len - remains in
+                { slice with pos; len } :: xs))
+        in
+        drop slices start
+      in
+      let buf = Buffer.create (stop - start) in
+      (* Copy [remains] bytes into [buf], one slice at a time. *)
+      let rec take slices remains =
+        if remains > 0
+        then (
+          match slices with
+          | [] -> assert false
+          | { s; pos; len } :: xs ->
+            let remains' = remains - len in
+            if remains' > 0
+            then (
+              Buffer.add_substring buf s pos len;
+              take xs remains')
+            else Buffer.add_substring buf s pos remains)
+      in
+      take slices (stop - start);
+      Buffer.contents buf)
+  ;;
+
+  (* [drop t remains] removes the first [remains] bytes of the concatenated
+     slices: whole slices are discarded while they fit in [remains], then the
+     next slice is trimmed at its front. Returns [[]] if [t] runs out first. *)
+  let rec drop t remains =
+    if remains = 0
+    then t
+    else (
+      match t with
+      | [] -> []
+      | ({ s = _; pos; len } as slice) :: t ->
+        if remains >= len
+        then drop t (remains - len)
+        else (
+          (* Advance past the dropped prefix and keep the remaining
+             [len - remains] bytes, like the helper in [get_substring]. *)
+          { slice with pos = pos + remains; len = len - remains } :: t))
+  ;;
+
+  (* [drop_rev t remains] applies [drop] to the reversed list and reverses the
+     result back, so slices are consumed starting from the last one of [t]. *)
+  let drop_rev t remains =
+    (* TODO Use a proper functional queue *)
+    if remains = 0 then t else List.rev (drop (List.rev t) remains)
+  ;;
+end
diff --git a/lib/unicode/slice.mli b/lib/unicode/slice.mli
new file mode 100644
index 00000000..67d9ec20
--- /dev/null
+++ b/lib/unicode/slice.mli
@@ -0,0 +1,19 @@
+(** A window of [len] bytes of [s] starting at byte offset [pos]. *)
+type t =
+  { s : string
+  ; pos : int
+  ; len : int
+  }
+
+module L : sig
+  type nonrec t = t list
+
+  (** [get_substring t ~start ~stop] is the bytes of the concatenation of [t]
+      from offset [start] (inclusive) to [stop] (exclusive). *)
+  val get_substring : t -> start:int -> stop:int -> string
+
+  (** [drop_rev t n] removes [n] bytes from [t], consuming slices from the end
+      of the list towards its head; a partially consumed slice loses its
+      leading bytes. *)
+  val drop_rev : t -> int -> t
+end
diff --git a/lib/unicode/str.ml b/lib/unicode/str.ml
new file mode 100644
index 00000000..891384cd
--- /dev/null
+++ b/lib/unicode/str.ml
@@ -0,0 +1,508 @@
+(***********************************************************************) +(* *) +(* Objective Caml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU Library General Public License, with *) +(* linking exception. *) +(* *) +(***********************************************************************) + +(* Modified by Jerome.Vouillon@pps.jussieu.fr for integration in RE *) + +(* $Id: re_str.ml,v 1.3 2002/07/03 15:47:54 vouillon Exp $ *) + +module type T = sig + (** The type of compiled regular expressions. *) + type regexp + + (** Compile a regular expression. The syntax for regular expressions is the + same as in Gnu Emacs. The special characters are [$^.*+?[]]. The following + constructs are recognized: + - [. ] matches any character except newline + - [* ] (postfix) matches the previous expression zero, one or several + times + - [+ ] (postfix) matches the previous expression one or several times + - [? ] (postfix) matches the previous expression once or not at all + - [[..] ] character set; ranges are denoted with [-], as in [[a-z]]; an + initial [^], as in [[^0-9]], complements the set + - [^ ] matches at beginning of line + - [$ ] matches at end of line + - [\| ] (infix) alternative between two expressions + - [\(..\)] grouping and naming of the enclosed expression + - [\1 ] the text matched by the first [\(...\)] expression ([\2] for + the second expression, etc) + - [\b ] matches word boundaries + - [\ ] quotes special characters. *) + val regexp : string -> regexp + + (** Same as [regexp], but the compiled expression will match text in a + case-insensitive way: uppercase and lowercase letters will be considered + equivalent. *) + val regexp_case_fold : string -> regexp + + (** [Str.quote s] returns a regexp string that matches exactly [s] and nothing + else. 
*) + val quote : string -> string + + (** [Str.regexp_string s] returns a regular expression that matches exactly + [s] and nothing else. *) + val regexp_string : string -> regexp + + (** [Str.regexp_string_case_fold] is similar to [Str.regexp_string], but the + regexp matches in a case-insensitive way. *) + val regexp_string_case_fold : string -> regexp + + (** {2 String matching and searching} *) + + (** [string_match r s start] tests whether the characters in [s] starting at + position [start] match the regular expression [r]. The first character of + a string has position [0], as usual. *) + val string_match : regexp -> string -> int -> bool + + (** [search_forward r s start] searches the string [s] for a substring + matching the regular expression [r]. The search starts at position [start] + and proceeds towards the end of the string. Return the position of the + first character of the matched substring, or raise [Not_found] if no + substring matches. *) + val search_forward : regexp -> string -> int -> int + + (** Same as [search_forward], but the search proceeds towards the beginning of + the string. *) + val search_backward : regexp -> string -> int -> int + + (** Similar to [string_match], but succeeds whenever the argument string is a + prefix of a string that matches. This includes the case of a true complete + match. *) + val string_partial_match : regexp -> string -> int -> bool + + (** [matched_string s] returns the substring of [s] that was matched by the + latest [string_match], [search_forward] or [search_backward]. The user + must make sure that the parameter [s] is the same string that was passed + to the matching or searching function. *) + val matched_string : string -> string + + (** [match_beginning ()] returns the position of the first character of the + substring that was matched by [string_match], [search_forward] or + [search_backward]. 
*) + val match_beginning : unit -> int + + (** [match_end ()] returns the position of the character following the last + character of the substring that was matched by [string_match], + [search_forward] or [search_backward]. *) + val match_end : unit -> int + + (** [matched_group n s] returns the substring of [s] that was matched by the + [n]th group [\(...\)] of the regular expression during the latest + [string_match], [search_forward] or [search_backward]. The user must make + sure that the parameter [s] is the same string that was passed to the + matching or searching function. [matched_group n s] raises [Not_found] if + the [n]th group of the regular expression was not matched. This can happen + with groups inside alternatives [\|], options [?] or repetitions [*]. For + instance, the empty string will match [\(a\)*], but [matched_group 1 ""] + will raise [Not_found] because the first group itself was not matched. *) + val matched_group : int -> string -> string + + (** [group_beginning n] returns the position of the first character of the + substring that was matched by the [n]th group of the regular expression. + Raises [Not_found] if the [n]th group of the regular expression was not + matched. *) + val group_beginning : int -> int + + (** [group_end n] returns the position of the character following the last + character of the matched substring. Raises [Not_found] if the [n]th group + of the regular expression was not matched. *) + val group_end : int -> int + + (** {2 Replacement} *) + + (** [global_replace regexp templ s] returns a string identical to [s], except + that all substrings of [s] that match [regexp] have been replaced by + [templ]. The replacement template [templ] can contain [\1], [\2], etc; + these sequences will be replaced by the text matched by the corresponding + group in the regular expression. [\0] stands for the text matched by the + whole regular expression. 
*) + val global_replace : regexp -> string -> string -> string + + (** Same as [global_replace], except that only the first substring matching + the regular expression is replaced. *) + val replace_first : regexp -> string -> string -> string + + (** [global_substitute regexp subst s] returns a string identical to [s], + except that all substrings of [s] that match [regexp] have been replaced + by the result of function [subst]. The function [subst] is called once for + each matching substring, and receives [s] (the whole text) as argument. *) + val global_substitute : regexp -> (string -> string) -> string -> string + + (** Same as [global_substitute], except that only the first substring matching + the regular expression is replaced. *) + val substitute_first : regexp -> (string -> string) -> string -> string + + (** [replace_matched repl s] returns the replacement text [repl] in which + [\1], [\2], etc. have been replaced by the text matched by the + corresponding groups in the most recent matching operation. [s] must be + the same string that was matched during this matching operation. *) + val replace_matched : string -> string -> string + + (** {2 Splitting} *) + + (** [split r s] splits [s] into substrings, taking as delimiters the + substrings that match [r], and returns the list of substrings. For + instance, [split (regexp "[ \t]+") s] splits [s] into blank-separated + words. An occurrence of the delimiter at the beginning and at the end of + the string is ignored. *) + val split : regexp -> string -> string list + + (** Same as [split], but splits into at most [n] substrings, where [n] is the + extra integer parameter. *) + val bounded_split : regexp -> string -> int -> string list + + (** Same as [split], but occurrences of the delimiter at the beginning and at + the end of the string are recognized and returned as empty strings in the + result. 
For instance, [split_delim (regexp " ") " abc "] returns + [[""; "abc"; ""]], while [split] with the same arguments returns + [["abc"]]. *) + val split_delim : regexp -> string -> string list + + (** Same as [bounded_split] and [split_delim], but occurrences of the + delimiter at the beginning and at the end of the string are recognized and + returned as empty strings in the result. For instance, + [split_delim (regexp " ") " abc "] returns [[""; "abc"; ""]], while + [split] with the same arguments returns [["abc"]]. *) + val bounded_split_delim : regexp -> string -> int -> string list + + type split_result = Text of string | Delim of string + + (** Same as [split_delim], but returns the delimiters as well as the + substrings contained between delimiters. The former are tagged [Delim] in + the result list; the latter are tagged [Text]. For instance, + [full_split (regexp "[{}]") "{ab}"] returns + [[Delim "{"; Text "ab"; Delim "}"]]. *) + val full_split : regexp -> string -> split_result list + + (** Same as [split_delim] and [bounded_split_delim], but returns the + delimiters as well as the substrings contained between delimiters. The + former are tagged [Delim] in the result list; the latter are tagged + [Text]. For instance, [full_split (regexp "[{}]") "{ab}"] returns + [[Delim "{"; Text "ab"; Delim "}"]]. *) + val bounded_full_split : regexp -> string -> int -> split_result list + + (** {2 Extracting substrings} *) + + (** [string_before s n] returns the substring of all characters of [s] that + precede position [n] (excluding the character at position [n]). *) + val string_before : string -> int -> string + + (** [string_after s n] returns the substring of all characters of [s] that + follow position [n] (including the character at position [n]). *) + val string_after : string -> int -> string + + (** [first_chars s n] returns the first [n] characters of [s]. This is the + same function as [string_before]. 
*) + val first_chars : string -> int -> string + + (** [last_chars s n] returns the last [n] characters of [s]. *) + val last_chars : string -> int -> string +end + +module Make (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = struct + +module Ast_ = Ast.Make(Cset)(Color_map) +module Ast = Ast_.Export + + +module Compile = Compile.Make(Cset)(Color_map) +module Core = Core.Make(Cset)(Color_map) +module Emacs = Emacs.Make(Cset)(Color_map) + + +include struct + open Core + + let exec = exec + let exec_partial = exec_partial +end + +type regexp = + { mtch : Compile.re Lazy.t + ; srch : Compile.re Lazy.t + } + +let compile_regexp s c = + let re = Emacs.re_no_emacs ~case:(not c) s in + { mtch = lazy (Compile.compile (Ast.seq [ Ast.start; re ])) + ; srch = lazy (Compile.compile re) + } +;; + + +let state = Domain.DLS.new_key (fun () -> None) + +let string_match re s p = + match exec ~pos:p (Lazy.force re.mtch) s with + | res -> + Domain.DLS.set state (Some res); + true + | exception Not_found -> + Domain.DLS.set state None; + false +;; + +let string_partial_match re s p = + match exec_partial ~pos:p (Lazy.force re.mtch) s with + | `Full -> string_match re s p + | `Partial -> true + | `Mismatch -> false +;; + +let search_forward re s p = + match exec ~pos:p (Lazy.force re.srch) s with + | res -> + Domain.DLS.set state (Some res); + fst (Group.offset res 0) + | exception Not_found -> + Domain.DLS.set state None; + raise Not_found +;; + +let rec search_backward re s p = + match exec ~pos:p (Lazy.force re.mtch) s with + | res -> + Domain.DLS.set state (Some res); + p + | exception Not_found -> + Domain.DLS.set state None; + if p = 0 then raise Not_found else search_backward re s (p - 1) +;; + +let valid_group n = + n >= 0 + && n < 10 + && + match Domain.DLS.get state with + | None -> false + | Some m -> n < Group.nb_groups m +;; + +let offset_group i = + match Domain.DLS.get state with + | Some 
m -> Group.offset m i + | None -> raise Not_found +;; + +let group_len i = + match offset_group i with + | b, e -> e - b + | exception Not_found -> 0 +;; + +let rec repl_length repl p q len = + if p < len + then + if repl.[p] <> '\\' + then repl_length repl (p + 1) (q + 1) len + else ( + let p = p + 1 in + if p = len then failwith "Str.replace: illegal backslash sequence"; + let q = + match repl.[p] with + | '\\' -> q + 1 + | '0' .. '9' as c -> q + group_len (Char.code c - Char.code '0') + | _ -> q + 2 + in + repl_length repl (p + 1) q len) + else q +;; + +let rec replace orig repl p res q len = + if p < len + then ( + let c = repl.[p] in + if c <> '\\' + then ( + Bytes.set res q c; + replace orig repl (p + 1) res (q + 1) len) + else ( + match repl.[p + 1] with + | '\\' -> + Bytes.set res q '\\'; + replace orig repl (p + 2) res (q + 1) len + | '0' .. '9' as c -> + let d = + let group = Char.code c - Char.code '0' in + match offset_group group with + | exception Not_found -> 0 + | b, e -> + let d = e - b in + if d > 0 then String.blit orig b res q d; + d + in + replace orig repl (p + 2) res (q + d) len + | c -> + Bytes.set res q '\\'; + Bytes.set res (q + 1) c; + replace orig repl (p + 2) res (q + 2) len)) +;; + +let replacement_text repl orig = + let len = String.length repl in + let res = Bytes.create (repl_length repl 0 0 len) in + replace orig repl 0 res 0 (String.length repl); + Bytes.unsafe_to_string res +;; + +let quote s = + let len = String.length s in + let buf = Buffer.create (2 * len) in + for i = 0 to len - 1 do + match s.[i] with + | ('[' | ']' | '*' | '.' | '\\' | '?' 
| '+' | '^' | '$') as c -> + Buffer.add_char buf '\\'; + Buffer.add_char buf c + | c -> Buffer.add_char buf c + done; + Buffer.contents buf +;; + +let string_before s n = String.sub s 0 n +let string_after s n = String.sub s n (String.length s - n) +let first_chars s n = String.sub s 0 n +let last_chars s n = String.sub s (String.length s - n) n +let regexp e = compile_regexp e false +let regexp_case_fold e = compile_regexp e true +let regexp_string s = compile_regexp (quote s) false +let regexp_string_case_fold s = compile_regexp (quote s) true + +let group_beginning n = + if not (valid_group n) then invalid_arg "Str.group_beginning"; + let pos = fst (offset_group n) in + if pos = -1 then raise Not_found else pos +;; + +let group_end n = + if not (valid_group n) then invalid_arg "Str.group_end"; + let pos = snd (offset_group n) in + if pos = -1 then raise Not_found else pos +;; + +let matched_group n txt = + let b, e = offset_group n in + String.sub txt b (e - b) +;; + +let replace_matched repl matched = replacement_text repl matched + +let match_beginning () = group_beginning 0 +and match_end () = group_end 0 +and matched_string txt = matched_group 0 txt + +let substitute_first expr repl_fun text = + try + let pos = search_forward expr text 0 in + String.concat + "" + [ string_before text pos; repl_fun text; string_after text (match_end ()) ] + with + | Not_found -> text +;; + +let global_substitute expr repl_fun text = + let rec replace accu start last_was_empty = + let startpos = if last_was_empty then start + 1 else start in + if startpos > String.length text + then string_after text start :: accu + else ( + match search_forward expr text startpos with + | pos -> + let end_pos = match_end () in + let repl_text = repl_fun text in + replace + (repl_text :: String.sub text start (pos - start) :: accu) + end_pos + (end_pos = pos) + | exception Not_found -> string_after text start :: accu) + in + String.concat "" (List.rev (replace [] 0 false)) +;; + +let 
global_replace expr repl text = global_substitute expr (replacement_text repl) text +and replace_first expr repl text = substitute_first expr (replacement_text repl) text + +let search_forward_progress re s p = + let pos = search_forward re s p in + if match_end () > p + then pos + else if p < String.length s + then search_forward re s (p + 1) + else raise Not_found +;; + +let bounded_split expr text num = + let start = if string_match expr text 0 then match_end () else 0 in + let rec split accu start n = + if start >= String.length text + then accu + else if n = 1 + then string_after text start :: accu + else ( + match search_forward_progress expr text start with + | pos -> split (String.sub text start (pos - start) :: accu) (match_end ()) (n - 1) + | exception Not_found -> string_after text start :: accu) + in + List.rev (split [] start num) +;; + +let split expr text = bounded_split expr text 0 + +let bounded_split_delim expr text num = + let rec split accu start n = + if start > String.length text + then accu + else if n = 1 + then string_after text start :: accu + else ( + match search_forward_progress expr text start with + | pos -> split (String.sub text start (pos - start) :: accu) (match_end ()) (n - 1) + | exception Not_found -> string_after text start :: accu) + in + if text = "" then [] else List.rev (split [] 0 num) +;; + +let split_delim expr text = bounded_split_delim expr text 0 + +type split_result = + | Text of string + | Delim of string + +let bounded_full_split expr text num = + let rec split accu start n = + if start >= String.length text + then accu + else if n = 1 + then Text (string_after text start) :: accu + else ( + match search_forward_progress expr text start with + | pos -> + let s = matched_string text in + if pos > start + then + split + (Delim s :: Text (String.sub text start (pos - start)) :: accu) + (match_end ()) + (n - 1) + else split (Delim s :: accu) (match_end ()) (n - 1) + | exception Not_found -> Text (string_after text 
start) :: accu) + in + List.rev (split [] 0 num) +;; + +let full_split expr text = bounded_full_split expr text 0 +end \ No newline at end of file diff --git a/lib/unicode/str.mli b/lib/unicode/str.mli new file mode 100644 index 00000000..71ed5adf --- /dev/null +++ b/lib/unicode/str.mli @@ -0,0 +1,219 @@ +(***********************************************************************) +(* *) +(* Objective Caml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU Library General Public License, with *) +(* linking exception. *) +(* *) +(***********************************************************************) + +(* $Id: re_str.mli,v 1.1 2002/01/16 14:16:04 vouillon Exp $ *) + +(** Module [Str]: regular expressions and high-level string processing *) + +(** {2 Regular expressions} *) + +module type T = sig + (** The type of compiled regular expressions. *) + type regexp + + (** Compile a regular expression. The syntax for regular expressions is the + same as in Gnu Emacs. The special characters are [$^.*+?[]]. The following + constructs are recognized: + - [. ] matches any character except newline + - [* ] (postfix) matches the previous expression zero, one or several + times + - [+ ] (postfix) matches the previous expression one or several times + - [? ] (postfix) matches the previous expression once or not at all + - [[..] ] character set; ranges are denoted with [-], as in [[a-z]]; an + initial [^], as in [[^0-9]], complements the set + - [^ ] matches at beginning of line + - [$ ] matches at end of line + - [\| ] (infix) alternative between two expressions + - [\(..\)] grouping and naming of the enclosed expression + - [\1 ] the text matched by the first [\(...\)] expression ([\2] for + the second expression, etc) + - [\b ] matches word boundaries + - [\ ] quotes special characters. 
*) + val regexp : string -> regexp + + (** Same as [regexp], but the compiled expression will match text in a + case-insensitive way: uppercase and lowercase letters will be considered + equivalent. *) + val regexp_case_fold : string -> regexp + + (** [Str.quote s] returns a regexp string that matches exactly [s] and nothing + else. *) + val quote : string -> string + + (** [Str.regexp_string s] returns a regular expression that matches exactly + [s] and nothing else. *) + val regexp_string : string -> regexp + + (** [Str.regexp_string_case_fold] is similar to [Str.regexp_string], but the + regexp matches in a case-insensitive way. *) + val regexp_string_case_fold : string -> regexp + + (** {2 String matching and searching} *) + + (** [string_match r s start] tests whether the characters in [s] starting at + position [start] match the regular expression [r]. The first character of + a string has position [0], as usual. *) + val string_match : regexp -> string -> int -> bool + + (** [search_forward r s start] searches the string [s] for a substring + matching the regular expression [r]. The search starts at position [start] + and proceeds towards the end of the string. Return the position of the + first character of the matched substring, or raise [Not_found] if no + substring matches. *) + val search_forward : regexp -> string -> int -> int + + (** Same as [search_forward], but the search proceeds towards the beginning of + the string. *) + val search_backward : regexp -> string -> int -> int + + (** Similar to [string_match], but succeeds whenever the argument string is a + prefix of a string that matches. This includes the case of a true complete + match. *) + val string_partial_match : regexp -> string -> int -> bool + + (** [matched_string s] returns the substring of [s] that was matched by the + latest [string_match], [search_forward] or [search_backward]. 
The user + must make sure that the parameter [s] is the same string that was passed + to the matching or searching function. *) + val matched_string : string -> string + + (** [match_beginning ()] returns the position of the first character of the + substring that was matched by [string_match], [search_forward] or + [search_backward]. *) + val match_beginning : unit -> int + + (** [match_end ()] returns the position of the character following the last + character of the substring that was matched by [string_match], + [search_forward] or [search_backward]. *) + val match_end : unit -> int + + (** [matched_group n s] returns the substring of [s] that was matched by the + [n]th group [\(...\)] of the regular expression during the latest + [string_match], [search_forward] or [search_backward]. The user must make + sure that the parameter [s] is the same string that was passed to the + matching or searching function. [matched_group n s] raises [Not_found] if + the [n]th group of the regular expression was not matched. This can happen + with groups inside alternatives [\|], options [?] or repetitions [*]. For + instance, the empty string will match [\(a\)*], but [matched_group 1 ""] + will raise [Not_found] because the first group itself was not matched. *) + val matched_group : int -> string -> string + + (** [group_beginning n] returns the position of the first character of the + substring that was matched by the [n]th group of the regular expression. + Raises [Not_found] if the [n]th group of the regular expression was not + matched. *) + val group_beginning : int -> int + + (** [group_end n] returns the position of the character following the last + character of the matched substring. Raises [Not_found] if the [n]th group + of the regular expression was not matched. 
*) + val group_end : int -> int + + (** {2 Replacement} *) + + (** [global_replace regexp templ s] returns a string identical to [s], except + that all substrings of [s] that match [regexp] have been replaced by + [templ]. The replacement template [templ] can contain [\1], [\2], etc; + these sequences will be replaced by the text matched by the corresponding + group in the regular expression. [\0] stands for the text matched by the + whole regular expression. *) + val global_replace : regexp -> string -> string -> string + + (** Same as [global_replace], except that only the first substring matching + the regular expression is replaced. *) + val replace_first : regexp -> string -> string -> string + + (** [global_substitute regexp subst s] returns a string identical to [s], + except that all substrings of [s] that match [regexp] have been replaced + by the result of function [subst]. The function [subst] is called once for + each matching substring, and receives [s] (the whole text) as argument. *) + val global_substitute : regexp -> (string -> string) -> string -> string + + (** Same as [global_substitute], except that only the first substring matching + the regular expression is replaced. *) + val substitute_first : regexp -> (string -> string) -> string -> string + + (** [replace_matched repl s] returns the replacement text [repl] in which + [\1], [\2], etc. have been replaced by the text matched by the + corresponding groups in the most recent matching operation. [s] must be + the same string that was matched during this matching operation. *) + val replace_matched : string -> string -> string + + (** {2 Splitting} *) + + (** [split r s] splits [s] into substrings, taking as delimiters the + substrings that match [r], and returns the list of substrings. For + instance, [split (regexp "[ \t]+") s] splits [s] into blank-separated + words. An occurrence of the delimiter at the beginning and at the end of + the string is ignored. 
*) + val split : regexp -> string -> string list + + (** Same as [split], but splits into at most [n] substrings, where [n] is the + extra integer parameter. *) + val bounded_split : regexp -> string -> int -> string list + + (** Same as [split], but occurrences of the delimiter at the beginning and at + the end of the string are recognized and returned as empty strings in the + result. For instance, [split_delim (regexp " ") " abc "] returns + [[""; "abc"; ""]], while [split] with the same arguments returns + [["abc"]]. *) + val split_delim : regexp -> string -> string list + + (** Same as [bounded_split] and [split_delim], but occurrences of the + delimiter at the beginning and at the end of the string are recognized and + returned as empty strings in the result. For instance, + [split_delim (regexp " ") " abc "] returns [[""; "abc"; ""]], while + [split] with the same arguments returns [["abc"]]. *) + val bounded_split_delim : regexp -> string -> int -> string list + + type split_result = Text of string | Delim of string + + (** Same as [split_delim], but returns the delimiters as well as the + substrings contained between delimiters. The former are tagged [Delim] in + the result list; the latter are tagged [Text]. For instance, + [full_split (regexp "[{}]") "{ab}"] returns + [[Delim "{"; Text "ab"; Delim "}"]]. *) + val full_split : regexp -> string -> split_result list + + (** Same as [split_delim] and [bounded_split_delim], but returns the + delimiters as well as the substrings contained between delimiters. The + former are tagged [Delim] in the result list; the latter are tagged + [Text]. For instance, [full_split (regexp "[{}]") "{ab}"] returns + [[Delim "{"; Text "ab"; Delim "}"]]. *) + val bounded_full_split : regexp -> string -> int -> split_result list + + (** {2 Extracting substrings} *) + + (** [string_before s n] returns the substring of all characters of [s] that + precede position [n] (excluding the character at position [n]). 
*) + val string_before : string -> int -> string + + (** [string_after s n] returns the substring of all characters of [s] that + follow position [n] (including the character at position [n]). *) + val string_after : string -> int -> string + + (** [first_chars s n] returns the first [n] characters of [s]. This is the + same function as [string_before]. *) + val first_chars : string -> int -> string + + (** [last_chars s n] returns the last [n] characters of [s]. *) + val last_chars : string -> int -> string +end + +module Make + (Cset : Cset.T) + (_ : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : T diff --git a/lib/unicode/uucodecs.ml b/lib/unicode/uucodecs.ml new file mode 100644 index 00000000..021a5244 --- /dev/null +++ b/lib/unicode/uucodecs.ml @@ -0,0 +1,929 @@ +exception CodecError +exception End_of_data + +module type T = sig + type letter + + val version : string + + module Unsafe : sig + val unsafe_slice : string -> int -> int -> bytes + val unsafe_bytes_with_next_pos : string -> int -> bytes * int + val unsafe_bytes : string -> int -> bytes + val unsafe_bytes_rev_with_next_pos : string -> int -> bytes * int + val unsafe_bytes_rev : string -> int -> bytes + end + + val max_width : int + val rep : letter + val new_line : letter + val null : letter + val byte_length : letter -> int + val equal : letter -> letter -> bool + val compare : letter -> letter -> int + val from_int : int -> letter + val to_int : letter -> int + val from_bytes : bytes -> letter + val to_bytes : letter -> bytes + val set : bytes -> int -> letter -> int + val add : Buffer.t -> letter -> unit + val width : ('a -> int -> char) -> 'a -> int -> int + val width_rev : ('a -> int -> char) -> 'a -> int -> int + + (* encoder *) + + type 'a enc + + val encoder_make : (int -> 'a -> bytes -> 'a) -> 'a enc + val encoder_add : 'a enc -> 'a -> bytes -> 'a + val encoder_flush : 'a enc -> 'a -> 'a + val is_encoded : letter -> bool + val iter : 
(letter -> unit) -> string -> unit + val fold_left : ('acc -> letter -> 'acc) -> 'acc -> string -> 'acc + val fold_right : (letter -> 'acc -> 'acc) -> string -> 'acc -> 'acc + val to_list : string -> letter list + val to_seq : string -> letter Seq.t + + (* pretty printer *) + val pp : Format.formatter -> letter -> unit + val dump : Format.formatter -> letter -> unit +end + +module type Properties = sig + (** normalization form (see {!/uunf/UUnf/page-index}).*) + val form : Uunf.form + + (** see + {{:https://unicode.org/reports/tr15/#Stream_Safe_Text_Format}Stream-Safe + Text Format}. + + Note that if [stream_safe = true], then there is at least one starter + every 31 characters. A starter is a character, in a Unicode Normalization + Form, characterized by its canonical combining class value of zero (see + {!Uunf.ccc}). The Stream Safe Text Format ensures that one combining + grapheme joiner (U+034F) is inserted every 30 non-starters. + + It is interesting to note that its + {b canonical combining class value is zero} and its + {b properties NFC_QC, NFD_QC, NFKC_QC and NFKD_QC are all `True}.*) + val stream_safe : bool +end + +module DefaultProperties : Properties = struct + let form = `NFC + let stream_safe = true +end + +module type Skel = sig + type letter + + include Properties + + val byte_length : letter -> int + val equal : letter -> letter -> bool + val compare : letter -> letter -> int + val from_int : int -> letter + val to_int : letter -> int + val from_bytes : bytes -> letter + val to_bytes : letter -> bytes + val set : bytes -> int -> letter -> int + val add : Buffer.t -> letter -> unit + val width : ('a -> int -> char) -> 'a -> int -> int + val width_rev : ('a -> int -> char) -> 'a -> int -> int + + (*pretty printer *) + val pp : Format.formatter -> letter -> unit + val dump : Format.formatter -> letter -> unit +end + +module MakeUtf8Skel (P : Properties) : Skel with type letter = Uchar.t = struct + type letter = Uchar.t + + include P + + let 
byte_length t = Uchar.utf_8_byte_length t + let equal = Uchar.equal + let compare = Uchar.compare + let from_int = fun i -> try Uchar.unsafe_of_int i with _ -> raise CodecError + let to_int = Uchar.to_int + + let int_to_4_uint8 n = + if n > 0x10FFFF then raise CodecError + else + let bytes = Bytes.create 4 in + List.iteri (Bytes.set_uint8 bytes) + [ + 0xF0 lor (n lsr 18); + 0x80 lor ((n lsr 12) land 0x3F); + 0x80 lor ((n lsr 6) land 0x3F); + 0x80 lor (n land 0x3F); + ]; + bytes + + let int_to_3_uint8 n = + if n > 0xFFFF then int_to_4_uint8 n + else + let bytes = Bytes.create 3 in + List.iteri (Bytes.set_uint8 bytes) + [ + 0xE0 lor (n lsr 12); + 0x80 lor ((n lsr 6) land 0x3F); + 0x80 lor (n land 0x3F); + ]; + bytes + + let int_to_2_uint8 n = + if n > 0x07FF then int_to_3_uint8 n + else + let bytes = Bytes.create 2 in + List.iteri (Bytes.set_uint8 bytes) + [ 0xC0 lor (n lsr 6); 0x80 lor (n land 0x3F) ]; + bytes + + let to_bytes u = + let n = Uchar.to_int u in + if n > 0x07F then int_to_2_uint8 n + else + let bytes = Bytes.create 1 in + Bytes.set_uint8 bytes 0 n; + bytes + + let set bytes pos t = + let t_bytes = to_bytes t in + let len = Bytes.length t_bytes in + Bytes.blit t_bytes 0 bytes pos len; + len + + let add buf t = Buffer.add_bytes buf @@ to_bytes t + + let pp : Format.formatter -> letter -> unit = + fun ppf cp -> + let len = byte_length cp in + let b = Buffer.create len in + add b cp; + let gc = Buffer.contents b in + if gc = "" then () + else ( + Format.fprintf ppf "@<1>%s" gc; + Buffer.clear b) + + let dump : Format.formatter -> letter -> unit = + fun ppf t -> Format.fprintf ppf "{U+%04X}" (Uchar.to_int t) + + let int_of_2_uint8 n1 n2 = + if n1 < 0xc2 || 0xdf < n1 then raise CodecError; + if n2 < 0x80 || 0xbf < n2 then raise CodecError; + if n2 lsr 6 != 0b10 then raise CodecError; + ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) + + let int_of_3_uint8 n1 n2 n3 = + if n1 = 0xe0 then ( + if n2 < 0xa0 || 0xbf < n2 then raise CodecError; + if n3 < 0x80 || 0xbf < n3 
then raise CodecError) + else ( + if n1 < 0xe1 || 0xef < n1 then raise CodecError; + if n2 < 0x80 || 0xbf < n2 then raise CodecError; + if n3 < 0x80 || 0xbf < n3 then raise CodecError); + if n2 lsr 6 != 0b10 || n3 lsr 6 != 0b10 then raise CodecError; + let p = + ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) + in + (* U+D800..U+DFFF (the whole surrogate block, upper bound included) are not + Unicode scalar values, so a 3-byte sequence decoding to one is invalid. *) + if p >= 0xd800 && p <= 0xdfff then raise CodecError; + p + + let int_of_4_uint8 n1 n2 n3 n4 = + if n1 = 0xf0 then ( + if n2 < 0x90 || 0xbf < n2 then raise CodecError; + if n3 < 0x80 || 0xbf < n3 then raise CodecError; + if n4 < 0x80 || 0xbf < n4 then raise CodecError) + else if n1 = 0xf4 then ( + if n2 < 0x80 || 0x8f < n2 then raise CodecError; + if n3 < 0x80 || 0xbf < n3 then raise CodecError; + if n4 < 0x80 || 0xbf < n4 then raise CodecError) + else ( + if n1 < 0xf1 || 0xf3 < n1 then raise CodecError; + if n2 < 0x80 || 0xbf < n2 then raise CodecError; + if n3 < 0x80 || 0xbf < n3 then raise CodecError; + if n4 < 0x80 || 0xbf < n4 then raise CodecError); + if n2 lsr 6 != 0b10 || n3 lsr 6 != 0b10 || n4 lsr 6 != 0b10 then + raise CodecError; + ((n1 land 0x07) lsl 18) + lor ((n2 land 0x3f) lsl 12) + lor ((n3 land 0x3f) lsl 6) + lor (n4 land 0x3f) + + let from_bytes bytes = + match Bytes.length bytes with + | 1 -> Bytes.get_uint8 bytes 0 |> Uchar.unsafe_of_int + | 2 -> + int_of_2_uint8 (Bytes.get_uint8 bytes 0) (Bytes.get_uint8 bytes 1) + |> Uchar.unsafe_of_int + | 3 -> + int_of_3_uint8 (Bytes.get_uint8 bytes 0) (Bytes.get_uint8 bytes 1) + (Bytes.get_uint8 bytes 2) + |> Uchar.unsafe_of_int + | 4 -> + int_of_4_uint8 (Bytes.get_uint8 bytes 0) (Bytes.get_uint8 bytes 1) + (Bytes.get_uint8 bytes 2) (Bytes.get_uint8 bytes 3) + |> Uchar.unsafe_of_int + | _ -> raise CodecError + + let width f x pos = + let n = f x pos |> Char.code in + if n < 0x80 then 1 + else if n < 0xc2 then raise CodecError + else if n < 0xe0 then 2 + else if n < 0xf0 then 3 + else if n < 0xf5 then 4 + else raise CodecError + + let width_rev f x pos = + let f = fun v ofs 
-> f v ofs |> Char.code in + let decode_length_pos_3 fn v pos = + let byte = fn v (pos - 3) in + if byte > 0xEF && byte < 0xF5 then 4 else raise CodecError + in + let decode_length_pos_2 fn v pos = + let byte = fn v (pos - 2) in + if byte > 0xDF && byte < 0xF0 then 3 else decode_length_pos_3 fn v pos + in + let decode_length_pos_1 fn v pos = + let byte = fn v (pos - 1) in + if byte > 0xC1 && byte < 0xE0 then 2 else decode_length_pos_2 fn v pos + in + let decode_length_pos_0 fn v pos = + let byte = fn v pos in + if byte < 0x80 then 1 else decode_length_pos_1 fn v pos + in + decode_length_pos_0 f x pos +end + +external unsafe_get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16u" + +external unsafe_set_uint16_ne : bytes -> int -> int -> unit + = "%caml_bytes_set16u" + +external swap16 : int -> int = "%bswap16" + +module MakeUtf16beSkel (P : Properties) : Skel with type letter = Uchar.t = +struct + type letter = Uchar.t + + include P + + let byte_length t = Uchar.utf_16_byte_length t + let equal = Uchar.equal + let compare = Uchar.compare + let from_int = Uchar.unsafe_of_int + let to_int = Uchar.to_int + + let unsafe_get_uint16_be b i = + if Sys.big_endian then unsafe_get_uint16_ne b i + else swap16 (unsafe_get_uint16_ne b i) + + let unsafe_set_uint16_be b i x = + if Sys.big_endian then unsafe_set_uint16_ne b i x + else unsafe_set_uint16_ne b i (swap16 x) + + let to_bytes u = + let n = Uchar.to_int u in + if n > 0xffff then ( + if n > 0x10ffff then raise CodecError + else + let n' = n - 0x10000 in + let hi = 0xd800 lor (n' lsr 10) in + let lo = 0xdc00 lor (n' land 0x3ff) in + let bytes = Bytes.create 4 in + unsafe_set_uint16_be bytes 0 hi; + unsafe_set_uint16_be bytes 2 lo; + bytes) + else + let bytes = Bytes.create 2 in + unsafe_set_uint16_be bytes 0 n; + bytes + + let set bytes pos u = + let n = Uchar.to_int u in + if n > 0xffff then ( + if n > 0x10ffff then raise CodecError + else + let n' = n - 0x10000 in + let hi = 0xd800 lor (n' lsr 10) in + let lo = 0xdc00 
lor (n' land 0x3ff) in + unsafe_set_uint16_be bytes pos hi; + unsafe_set_uint16_be bytes (pos + 2) lo; + 4) + else ( + unsafe_set_uint16_be bytes pos n; + 2) + + let add buf t = Buffer.add_bytes buf @@ to_bytes t + + let pp : Format.formatter -> letter -> unit = + fun ppf cp -> + let len = byte_length cp in + let b = Buffer.create len in + add b cp; + let gc = Buffer.contents b in + if gc = "" then () + else ( + Format.fprintf ppf "@<1>%s" gc; + Buffer.clear b) + + let dump : Format.formatter -> letter -> unit = + fun ppf t -> Format.fprintf ppf "{U+%04X}" (Uchar.to_int t) + + let int_of_2_uint8 bytes = + let w = unsafe_get_uint16_be bytes 0 in + if w < 0xd800 || 0xdfff < w then w else raise CodecError + + (* [int_of_4_uint8 bytes] decodes a big-endian surrogate pair: the first 16-bit + unit must be a high surrogate (0xD800..0xDBFF) and the second a low + surrogate (0xDC00..0xDFFF), mirroring what [to_bytes] writes; any other + combination raises [CodecError]. *) + let int_of_4_uint8 bytes = + let hi = unsafe_get_uint16_be bytes 0 in + let lo = unsafe_get_uint16_be bytes 2 in + if hi < 0xd800 || hi > 0xdbff || lo < 0xdc00 || lo > 0xdfff then + raise CodecError + else (((hi land 0x3ff) lsl 10) lor (lo land 0x3ff)) + 0x10000 + + let from_bytes bytes = + match Bytes.length bytes with + | 2 -> int_of_2_uint8 bytes |> Uchar.unsafe_of_int + | 4 -> int_of_4_uint8 bytes |> Uchar.unsafe_of_int + | _ -> raise CodecError + + let width : ('a -> int -> char) -> 'a -> int -> int = + fun f x pos -> + let buf = Buffer.create 2 in + Buffer.add_char buf @@ f x pos; + Buffer.add_char buf @@ f x (pos + 1); + let hi = unsafe_get_uint16_be (Buffer.to_bytes buf) 0 in + Buffer.clear buf; + if hi < 0xd800 || 0xdfff < hi then 2 + else if hi <= 0xdbff then 4 + else raise CodecError + + let width_rev f x pos = + let decode_length_pos_1 fn v ofs buf = + Buffer.add_char buf @@ fn v (ofs - 3); + Buffer.add_char buf @@ fn v (ofs - 2); + let hi = unsafe_get_uint16_be (Buffer.to_bytes buf) 0 in + Buffer.clear buf; + if hi <= 0xdbff then 4 else raise CodecError + in + let decode_length_pos_0 fn v ofs buf = + Buffer.add_char buf @@ fn v (ofs - 1); + Buffer.add_char buf @@ fn v ofs; + let hi = unsafe_get_uint16_be (Buffer.to_bytes buf) 0 in + Buffer.reset buf; + if hi < 0xd800 || 0xdfff < hi 
then 2 else decode_length_pos_1 f v ofs buf + in + let buf = Buffer.create 2 in + decode_length_pos_0 f x pos buf +end + +module MakeUtf16leSkel (P : Properties) : Skel with type letter = Uchar.t = +struct + type letter = Uchar.t + + include P + + let byte_length t = Uchar.utf_16_byte_length t + let equal = Uchar.equal + let compare = Uchar.compare + let from_int = Uchar.unsafe_of_int + let to_int = Uchar.to_int + + let unsafe_get_uint16_le b i = + if Sys.big_endian then swap16 (unsafe_get_uint16_ne b i) + else unsafe_get_uint16_ne b i + + let unsafe_set_uint16_le b i x = + if Sys.big_endian then unsafe_set_uint16_ne b i (swap16 x) + else unsafe_set_uint16_ne b i x + + let to_bytes u = + let n = Uchar.to_int u in + if n > 0xffff then ( + if n > 0x10ffff then raise CodecError + else + let n' = n - 0x10000 in + let hi = 0xd800 lor (n' lsr 10) in + let lo = 0xdc00 lor (n' land 0x3ff) in + let bytes = Bytes.create 4 in + unsafe_set_uint16_le bytes 0 hi; + unsafe_set_uint16_le bytes 2 lo; + bytes) + else + let bytes = Bytes.create 2 in + unsafe_set_uint16_le bytes 0 n; + bytes + + let set bytes pos u = + let n = Uchar.to_int u in + if n > 0xffff then ( + if n > 0x10ffff then raise CodecError + else + let n' = n - 0x10000 in + let hi = 0xd800 lor (n' lsr 10) in + let lo = 0xdc00 lor (n' land 0x3ff) in + unsafe_set_uint16_le bytes pos hi; + unsafe_set_uint16_le bytes (pos + 2) lo; + 4) + else ( + unsafe_set_uint16_le bytes pos n; + 2) + + let add buf t = Buffer.add_bytes buf @@ to_bytes t + + let pp : Format.formatter -> letter -> unit = + fun ppf t -> + let len = byte_length t in + let b = Buffer.create len in + add b t; + let gc = Buffer.contents b in + if gc = "" then () + else ( + Format.fprintf ppf "@<1>%s" gc; + Buffer.clear b) + + let dump : Format.formatter -> letter -> unit = + fun ppf t -> Format.fprintf ppf "{U+%04X}" (Uchar.to_int t) + + let int_of_2_uint8 bytes = + let w = unsafe_get_uint16_le bytes 0 in + if w < 0xd800 || 0xdfff < w then w else raise 
CodecError + + (* [int_of_4_uint8 bytes] decodes a little-endian surrogate pair: the first + 16-bit unit must be a high surrogate (0xD800..0xDBFF) and the second a low + surrogate (0xDC00..0xDFFF), mirroring what [to_bytes] writes; any other + combination raises [CodecError]. *) + let int_of_4_uint8 bytes = + let hi = unsafe_get_uint16_le bytes 0 in + let lo = unsafe_get_uint16_le bytes 2 in + if hi < 0xd800 || hi > 0xdbff || lo < 0xdc00 || lo > 0xdfff then + raise CodecError + else (((hi land 0x3ff) lsl 10) lor (lo land 0x3ff)) + 0x10000 + + let from_bytes bytes = + match Bytes.length bytes with + | 2 -> int_of_2_uint8 bytes |> Uchar.unsafe_of_int + | 4 -> int_of_4_uint8 bytes |> Uchar.unsafe_of_int + | _ -> raise CodecError + + let width : ('a -> int -> char) -> 'a -> int -> int = + fun f x pos -> + let buf = Buffer.create 2 in + Buffer.add_char buf @@ f x pos; + Buffer.add_char buf @@ f x (pos + 1); + let hi = unsafe_get_uint16_le (Buffer.to_bytes buf) 0 in + Buffer.clear buf; + if hi < 0xd800 || 0xdfff < hi then 2 + else if hi <= 0xdbff then 4 + else raise CodecError + + let width_rev f x pos = + let decode_length_pos_1 fn v ofs buf = + Buffer.add_char buf @@ fn v (ofs - 3); + Buffer.add_char buf @@ fn v (ofs - 2); + let hi = unsafe_get_uint16_le (Buffer.to_bytes buf) 0 in + Buffer.clear buf; + if hi <= 0xdbff then 4 else raise CodecError + in + let decode_length_pos_0 fn v ofs buf = + Buffer.add_char buf @@ fn v (ofs - 1); + Buffer.add_char buf @@ fn v ofs; + let hi = unsafe_get_uint16_le (Buffer.to_bytes buf) 0 in + Buffer.reset buf; + if hi < 0xd800 || 0xdfff < hi then 2 else decode_length_pos_1 f v ofs buf + in + let buf = Buffer.create 2 in + decode_length_pos_0 f x pos buf +end + +module MakeCodec (S : Skel with type letter = Uchar.t) : + T with type letter = Uchar.t = struct + include S + + let version = Unicode.unicode_version + let rep = Uchar.unsafe_of_int 0xfffd (* � *) + let new_line = Uchar.unsafe_of_int 10 + let null = Uchar.unsafe_of_int 0x3fffffff + let cgj_code = 0x034F (* combining grapheme joiner *) + let cgj = Uchar.unsafe_of_int cgj_code + + (* let error_code = 0x1A0000 *) + let max_width = 4 + let max_leading_nonstarters = 30 (* Safe-stream text *) + let chunk_size = 32 + + module Unsafe = struct + let unsafe_slice s ofs len = + if 
ofs = 0 && String.length s = len then Stdlib.Bytes.of_string s + else + let bytes = Stdlib.Bytes.create len in + Stdlib.Bytes.unsafe_blit + (Stdlib.Bytes.unsafe_of_string s) + ofs bytes 0 len; + bytes + + let unsafe_bytes_with_next_pos s ofs = + let w = width String.unsafe_get s ofs in + (unsafe_slice s ofs w, ofs + w) + + let unsafe_bytes_rev_with_next_pos s ofs = + let w = width_rev String.unsafe_get s ofs in + (unsafe_slice s (ofs + 1 - w) w, ofs - w) + + let unsafe_bytes s ofs = + let w = width String.unsafe_get s ofs in + unsafe_slice s ofs w + + let unsafe_bytes_rev s ofs = + let w = width_rev String.unsafe_get s ofs in + unsafe_slice s (ofs + 1 - w) w + end + + type decomp = { + leading_nonstarters : int; + has_starter : bool; + trailing_nonstarters : int; + length : int; + } + + let empty_d = + { + leading_nonstarters = 0; + has_starter = false; + trailing_nonstarters = 0; + length = 0; + } + + let add_decomp : decomp -> Uchar.t -> decomp = + fun d u -> + match Uunf.ccc u with + | 0 -> { d with has_starter = true; length = succ d.length } + | _ -> ( + match d.has_starter with + | false -> + { + d with + leading_nonstarters = succ d.leading_nonstarters; + length = succ d.length; + } + | true -> + { + d with + trailing_nonstarters = succ d.trailing_nonstarters; + length = succ d.length; + }) + + (* TODO: this would be more efficient if Uunf was with bytes and not Uchar.t?*) + type 'a enc = { + enc_frm : Uunf.t; + enc_nfkd : Uunf.t; + enc_o_append : int -> 'a -> bytes -> 'a; + mutable enc_index : int; + enc_o : bytes; + mutable enc_o_pos : int; + enc_o_len : int; + mutable enc_non_starters : int; (* number of nonstarters *) + } + + let encoder_make : (int -> 'a -> bytes -> 'a) -> 'a enc = + fun enc_o_append -> + let enc_o_len = chunk_size * max_width in + { + enc_frm = Uunf.create S.form; + enc_nfkd = Uunf.create (`NFKD :> Uunf.form); + enc_o_append; + enc_index = 0; + enc_o = Bytes.make enc_o_len '\000'; + enc_o_pos = 0; + enc_o_len; + enc_non_starters = 0; + 
} + + (* [empty_buffer enc acc0] hands every code point currently stored in the + output buffer [enc.enc_o] to [enc.enc_o_append], one at a time and in order, + threading the accumulator through, and leaves the buffer empty. Returns the + final accumulator. *) + let empty_buffer enc acc0 = + let rec iter acc bytes pos max = + if pos >= max then acc + else + let w = width Bytes.get bytes pos in + let acc = enc.enc_o_append enc.enc_index acc (Bytes.sub bytes pos w) in + enc.enc_index <- enc.enc_index + 1; + iter acc bytes (pos + w) max + in + (* Snapshot the pending bytes and reset the write position before draining: + without the reset the buffer never frees any space, so [add_output] would + keep writing at the old position and a later flush would emit the same + bytes again. *) + let len = enc.enc_o_pos in + let pending = Bytes.sub enc.enc_o 0 len in + enc.enc_o_pos <- 0; + iter acc0 pending 0 len + + let resize_buffer enc acc needed = + let buf_len = enc.enc_o_len - enc.enc_o_pos in + if buf_len < needed then empty_buffer enc acc else acc + + let add_output enc acc cp = + (* TODO: this part is not optimized: + we calculate several times the length of t!*) + let needed = S.byte_length cp in + let acc = resize_buffer enc acc needed in + let w = S.set enc.enc_o enc.enc_o_pos cp in + enc.enc_o_pos <- enc.enc_o_pos + w; + acc + + let rec normalize : 'a enc -> 'a -> Uunf.ret -> 'a = + fun enc acc v -> + let r = Uunf.add enc.enc_frm v in + match r with + | `Uchar u -> normalize enc (add_output enc acc u) `Await + | `Await | `End -> acc + + let encoder_flush : 'a enc -> 'a -> 'a = + fun enc acc -> normalize enc acc `End |> fun acc -> empty_buffer enc acc + + let rec add_nfkd : 'a enc -> decomp -> Uunf.ret -> decomp = + fun enc d v -> + match Uunf.add enc.enc_nfkd v with + | `Uchar u -> add_nfkd enc (add_decomp d u) `Await + | `Await | `End -> d + + let insert_cgj : 'a enc -> Uchar.t -> bool = + fun enc u -> + Uunf.reset enc.enc_nfkd; + let d = add_nfkd enc (add_nfkd enc empty_d (`Uchar u)) `End in + if enc.enc_non_starters + d.leading_nonstarters > max_leading_nonstarters + then ( + if not d.has_starter then enc.enc_non_starters <- d.length + else enc.enc_non_starters <- d.trailing_nonstarters; + true) + else ( + if not d.has_starter then + enc.enc_non_starters <- enc.enc_non_starters + d.length + else enc.enc_non_starters <- d.trailing_nonstarters; + false) + + let encoder_add : type a. 
a enc -> a -> bytes -> a = + fun enc acc bytes -> + let u = from_bytes bytes in + if stream_safe then + if insert_cgj enc u then + normalize enc (normalize enc acc (`Uchar cgj)) (`Uchar u) + else normalize enc acc (`Uchar u) + else normalize enc acc (`Uchar u) + + let quick_check = Unicode.nfx_quick_check form + + let is_encoded : letter -> bool = + fun u -> + match (quick_check u, Uunf.ccc u) with true, 0 -> true | _ -> false + + (* [to_seq s] decodes [s] and yields its code points after normalization to + [S.form]; when [stream_safe] is set a combining grapheme joiner is fed to + the normalizer wherever [insert_cgj] asks for one. Undecodable input is + replaced by [rep], one byte at a time. *) + let to_seq s = + let enc_frm = Uunf.create S.form in + let enc_nfkd = Uunf.create (`NFKD :> Uunf.form) in + let enc_non_starters = ref 0 in + let tmp_uchar = ref None in + let not_ended = ref true in + let max_i = String.length s in + let normalize v = Uunf.add enc_frm v in + let rec add_nfkd d v = + match Uunf.add enc_nfkd v with + | `Uchar u -> add_nfkd (add_decomp d u) `Await + | `Await | `End -> d + in + let insert_cgj u = + Uunf.reset enc_nfkd; + let d = add_nfkd (add_nfkd empty_d (`Uchar u)) `End in + if !enc_non_starters + d.leading_nonstarters > max_leading_nonstarters + then ( + if not d.has_starter then enc_non_starters := d.length + else enc_non_starters := d.trailing_nonstarters; + true) + else ( + if not d.has_starter then + enc_non_starters := !enc_non_starters + d.length + else enc_non_starters := d.trailing_nonstarters; + false) + in + let rec aux pos state () = + match state with + | `Uchar u -> Seq.Cons (u, aux pos `Await) + | `Await -> ( + match normalize `Await with + | `Uchar u -> aux pos (`Uchar u) () + | `Await -> ( + match !tmp_uchar with + | Some u_src -> + tmp_uchar := None; + aux pos (normalize (`Uchar u_src)) () + | None -> + let next_state, w = + if pos >= max_i && !not_ended then ( + not_ended := false; + (normalize `End, 0)) + else + (* Measure the width inside the handler and check it against + the remaining input: an invalid lead byte, or a sequence + truncated by the end of [s], then yields [rep] for one byte + instead of raising out of the traversal or slicing past the + end of [s]. *) + let u_src, w = + try + let w = width String.unsafe_get s pos in + if w > max_i - pos then raise CodecError; + (from_bytes @@ Unsafe.unsafe_slice s pos w, w) + with CodecError -> (rep, 1) + in + if stream_safe && insert_cgj u_src then ( + tmp_uchar := Some u_src; + (normalize (`Uchar cgj), w)) + else (normalize (`Uchar 
u_src), w) + in + aux (pos + w) next_state ()) + | `End -> aux pos `End ()) + | `End -> Seq.Nil + in + aux 0 `Await + + (* [fold_left f acc0 s] decodes [s] from its first byte, normalizes the code + points to [S.form] (feeding a combining grapheme joiner to the normalizer + when [stream_safe] is set and [insert_cgj] asks for one) and folds [f] over + the normalized code points. Undecodable input is replaced by [rep], one + byte at a time. *) + let fold_left f acc0 s = + let enc_frm = Uunf.create S.form in + let enc_nfkd = Uunf.create (`NFKD :> Uunf.form) in + let enc_non_starters = ref 0 in + let tmp_uchar = ref None in + let not_ended = ref true in + let max_i = String.length s in + let normalize v = Uunf.add enc_frm v in + let rec add_nfkd d v = + match Uunf.add enc_nfkd v with + | `Uchar u -> add_nfkd (add_decomp d u) `Await + | `Await | `End -> d + in + let insert_cgj u = + Uunf.reset enc_nfkd; + let d = add_nfkd (add_nfkd empty_d (`Uchar u)) `End in + if !enc_non_starters + d.leading_nonstarters > max_leading_nonstarters + then ( + if not d.has_starter then enc_non_starters := d.length + else enc_non_starters := d.trailing_nonstarters; + true) + else ( + if not d.has_starter then + enc_non_starters := !enc_non_starters + d.length + else enc_non_starters := d.trailing_nonstarters; + false) + in + let rec iter acc pos state = + match state with + | `Uchar u -> iter (f acc u) pos `Await + | `Await -> ( + match normalize `Await with + | `Uchar u -> iter acc pos (`Uchar u) + | `Await -> ( + match !tmp_uchar with + | Some u_src -> + tmp_uchar := None; + iter acc pos (normalize (`Uchar u_src)) + | None -> + let next_state, w = + if pos >= max_i && !not_ended then ( + not_ended := false; + (normalize `End, 0)) + else + (* Measure the width inside the handler and check it against + the remaining input: an invalid lead byte, or a sequence + truncated by the end of [s], then yields [rep] for one byte + instead of raising out of the fold or slicing past the end + of [s]. *) + let u_src, w = + try + let w = width String.unsafe_get s pos in + if w > max_i - pos then raise CodecError; + (from_bytes @@ Unsafe.unsafe_slice s pos w, w) + with CodecError -> (rep, 1) + in + if stream_safe && insert_cgj u_src then ( + tmp_uchar := Some u_src; + (normalize (`Uchar cgj), w)) + else (normalize (`Uchar u_src), w) + in + iter acc (pos + w) next_state) + | `End -> iter acc pos `End) + | `End -> acc + in + iter acc0 0 `Await + + let to_list s = fold_left (fun acc x -> x :: acc) [] s |> List.rev + let iter f s = fold_left (fun _ x -> f x) () s + + let fold_right f s acc0 = + let enc_frm = Uunf.create S.form 
in + let enc_nfkd = Uunf.create (`NFKD :> Uunf.form) in + let enc_non_starters = ref 0 in + let tmp_uchar = ref None in + let not_ended = ref true in + let max_i = String.length s in + let normalize v = Uunf.add enc_frm v in + let rec add_nfkd d v = + match Uunf.add enc_nfkd v with + | `Uchar u -> add_nfkd (add_decomp d u) `Await + | `Await | `End -> d + in + let insert_cgj u = + Uunf.reset enc_nfkd; + let d = add_nfkd (add_nfkd empty_d (`Uchar u)) `End in + if !enc_non_starters + d.leading_nonstarters > max_leading_nonstarters + then ( + if not d.has_starter then enc_non_starters := d.length + else enc_non_starters := d.trailing_nonstarters; + true) + else ( + if not d.has_starter then + enc_non_starters := !enc_non_starters + d.length + else enc_non_starters := d.trailing_nonstarters; + false) + in + let rec iter acc pos state = + match state with + | `Uchar u -> iter (f u acc) pos `Await + | `Await -> ( + match normalize `Await with + | `Uchar u -> iter acc pos (`Uchar u) + | `Await -> ( + match !tmp_uchar with + | Some u_src -> + tmp_uchar := None; + iter acc pos (normalize (`Uchar u_src)) + | None -> + let next_state, w = + if pos < 0 && !not_ended then ( + not_ended := false; + (normalize `End, 0)) + else + (* Measure the sequence inside the handler and read through a + bounds-checked getter: a malformed sequence, or one that + would start before index 0, then yields [rep] for one byte + instead of raising out of the fold or reading before the + start of [s]. *) + let get str i = + if i < 0 then raise CodecError else String.unsafe_get str i + in + let u_src, w = + try + let w = width_rev get s pos in + if w - 1 > pos then raise CodecError; + (from_bytes @@ Unsafe.unsafe_slice s (pos + 1 - w) w, w) + with CodecError -> (rep, 1) + in + if stream_safe && insert_cgj u_src then ( + tmp_uchar := Some u_src; + (normalize (`Uchar cgj), w)) + else (normalize (`Uchar u_src), w) + in + iter acc (pos - w) next_state) + | `End -> iter acc pos `End) + | `End -> acc + in + iter acc0 (max_i - 1) `Await +end + +module Utf8 : T with type letter = Uchar.t = + MakeCodec (MakeUtf8Skel (DefaultProperties)) + +module Utf16be : T with type letter = Uchar.t = + MakeCodec (MakeUtf16beSkel (DefaultProperties)) + +module Utf16le : T with type letter = Uchar.t = + MakeCodec (MakeUtf16leSkel (DefaultProperties)) + +module Latin1 : T with type letter = char 
= struct + type letter = char + + let version = Sys.ocaml_version + + module Unsafe = struct + let unsafe_slice : string -> int -> int -> bytes = + fun s ofs len -> Bytes.sub (Bytes.unsafe_of_string s) ofs len + + let unsafe_bytes_with_next_pos : string -> int -> bytes * int = + fun s ofs -> (unsafe_slice s ofs 1, succ ofs) + + let unsafe_bytes : string -> int -> bytes = + fun s ofs -> Bytes.sub (Bytes.unsafe_of_string s) ofs 1 + + let unsafe_bytes_rev_with_next_pos : string -> int -> bytes * int = + fun s ofs -> (unsafe_bytes s ofs, pred ofs) + + let unsafe_bytes_rev : string -> int -> bytes = + fun s ofs -> unsafe_bytes s ofs + end + + let rep = Char.unsafe_chr 32 + let new_line = '\n' + let null = '\000' + let max_width = 1 + let byte_length = fun _ -> 1 + let equal = Char.equal + let compare = Char.compare + let from_int = fun i -> try Char.chr i with _ -> raise CodecError + let to_int = Char.code + let to_bytes = fun t -> Bytes.make 1 @@ t + + let from_bytes = + fun bytes -> try Bytes.get bytes 0 with _ -> raise CodecError + + let set = + fun bytes ofs t -> + try + Bytes.set bytes ofs t; + 1 + with _ -> raise CodecError + + let add = fun buf t -> try Buffer.add_char buf t with _ -> raise CodecError + let width = fun _ _ _ -> 1 + let width_rev = fun _ _ _ -> 1 + + (* encoder *) + + type 'a enc = 'a -> bytes -> 'a + + let encoder_make : (int -> 'a -> bytes -> 'a) -> 'a enc = fun f -> f 0 + let encoder_add : 'a enc -> 'a -> bytes -> 'a = fun enc x bytes -> enc x bytes + let encoder_flush : 'a enc -> 'a -> 'a = fun _enc x -> x + let is_encoded : letter -> bool = fun _ -> true + let to_seq = String.to_seq + let iter = String.iter + let fold_left = String.fold_left + let fold_right = String.fold_right + let to_list s = String.fold_left (fun acc c -> c :: acc) [] s |> List.rev + + (*pretty printer *) + let pp : Format.formatter -> char -> unit = + fun ppf t -> Format.fprintf ppf "%c" t + + let dump : Format.formatter -> char -> unit = + fun ppf t -> Format.fprintf 
ppf "{%02X}" (Char.code t) +end diff --git a/lib/unicode/uucodecs.mli b/lib/unicode/uucodecs.mli new file mode 100644 index 00000000..88b6ebf5 --- /dev/null +++ b/lib/unicode/uucodecs.mli @@ -0,0 +1,106 @@ +exception CodecError +exception End_of_data + +module type T = sig + type letter + + val version : string + + module Unsafe : sig + val unsafe_slice : string -> int -> int -> bytes + val unsafe_bytes_with_next_pos : string -> int -> bytes * int + val unsafe_bytes : string -> int -> bytes + val unsafe_bytes_rev_with_next_pos : string -> int -> bytes * int + val unsafe_bytes_rev : string -> int -> bytes + end + + val max_width : int + val rep : letter + val new_line : letter + val null : letter + val byte_length : letter -> int + val equal : letter -> letter -> bool + val compare : letter -> letter -> int + val from_int : int -> letter + val to_int : letter -> int + val from_bytes : bytes -> letter + val to_bytes : letter -> bytes + val set : bytes -> int -> letter -> int + val add : Buffer.t -> letter -> unit + val width : ('a -> int -> char) -> 'a -> int -> int + val width_rev : ('a -> int -> char) -> 'a -> int -> int + + (* encoder *) + + type 'a enc + + val encoder_make : (int -> 'a -> bytes -> 'a) -> 'a enc + val encoder_add : 'a enc -> 'a -> bytes -> 'a + val encoder_flush : 'a enc -> 'a -> 'a + val is_encoded : letter -> bool + val iter : (letter -> unit) -> string -> unit + val fold_left : ('acc -> letter -> 'acc) -> 'acc -> string -> 'acc + val fold_right : (letter -> 'acc -> 'acc) -> string -> 'acc -> 'acc + val to_list : string -> letter list + val to_seq : string -> letter Seq.t + + (* pretty printer *) + val pp : Format.formatter -> letter -> unit + val dump : Format.formatter -> letter -> unit +end + +module type Properties = sig + (** normalization form (see {!/uunf/UUnf/page-index}).*) + val form : Uunf.form + + (** see + {{:https://unicode.org/reports/tr15/#Stream_Safe_Text_Format}Stream-Safe + Text Format}. 
+ + Note that if [stream_safe = true], then there is at least one starter + every 31 characters. A starter is a character, in a Unicode Normalization + Form, characterized by its canonical combining class value of zero (see + {!Uunf.ccc}). The Stream Safe Text Format ensures that one combining + grapheme joiner (U+034F) is inserted every 30 non-starters. + + It is interesting to note that its + {b canonical combining class value is zero} and its + {b properties NFC_QC, NFD_QC, NFKC_QC and NFKD_QC are all `True}.*) + val stream_safe : bool +end + +module DefaultProperties : Properties + +module type Skel = sig + type letter + + include Properties + + val byte_length : letter -> int + val equal : letter -> letter -> bool + val compare : letter -> letter -> int + val from_int : int -> letter + val to_int : letter -> int + val from_bytes : bytes -> letter + val to_bytes : letter -> bytes + val set : bytes -> int -> letter -> int + val add : Buffer.t -> letter -> unit + val width : ('a -> int -> char) -> 'a -> int -> int + val width_rev : ('a -> int -> char) -> 'a -> int -> int + + (*pretty printer *) + val pp : Format.formatter -> letter -> unit + val dump : Format.formatter -> letter -> unit +end + +module MakeUtf8Skel (_ : Properties) : Skel with type letter = Uchar.t +module MakeUtf16beSkel (_ : Properties) : Skel with type letter = Uchar.t +module MakeUtf16leSkel (_ : Properties) : Skel with type letter = Uchar.t + +module MakeCodec (_ : Skel with type letter = Uchar.t) : + T with type letter = Uchar.t + +module Utf8 : T with type letter = Uchar.t +module Utf16be : T with type letter = Uchar.t +module Utf16le : T with type letter = Uchar.t +module Latin1 : T with type letter = Char.t \ No newline at end of file diff --git a/lib/unicode/view.ml b/lib/unicode/view.ml new file mode 100644 index 00000000..dffa7c72 --- /dev/null +++ b/lib/unicode/view.ml @@ -0,0 +1,165 @@ +open Import + +module type T = sig + type letter + type ast + + + module Cset : sig + + type t + + 
module Range : sig + type t + + val first : t -> letter + val last : t -> letter + end + + val view : t -> Range.t list + end + + module Sem : sig + type t = [ `Longest | `Shortest | `First ] + end + + module Rep_kind : sig + type t = [ `Greedy | `Non_greedy ] + end + + type view = + | Set of Cset.t + | Sequence of ast list + | Alternative of ast list + | Repeat of ast * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Sem of Sem.t * ast + | Sem_greedy of Rep_kind.t * ast + | Group of string option * ast + | No_group of ast + | Nest of ast + | Case of ast + | No_case of ast + | Intersection of ast list + | Complement of ast list + | Difference of ast * ast + | Pmark of Pmark.t * ast + + val view : ast -> view +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) = +struct + module Ast = struct + include Ast + include Ast.Make (Cset) (Color_map) + end + type letter = Cset.letter + type ast = Ast.t + + module Cset = struct + type t = Cset.t + + module Range = struct + type t = { first : letter; last : letter } + + let first t = t.first + let last t = t.last + end + + let view t = + Cset.fold_right t ~init:[] ~f:(fun first last acc -> + let range = + { + Range.first = Cset.CodePage.to_letter first; + last = Cset.CodePage.to_letter last; + } + in + range :: acc) + end + + (* type letter = Cset.letter + type cset = Cset.t + type ast = Ast.t *) + + module Sem = Automata.Sem + module Rep_kind = Automata.Rep_kind + + type view = + | Set of Cset.t + | Sequence of Ast.t list + | Alternative of Ast.t list + | Repeat of Ast.t * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Sem of Sem.t * Ast.t + | Sem_greedy of Rep_kind.t * Ast.t + | Group of 
string option * Ast.t + | No_group of Ast.t + | Nest of Ast.t + | Case of Ast.t + | No_case of Ast.t + | Intersection of Ast.t list + | Complement of Ast.t list + | Difference of Ast.t * Ast.t + | Pmark of Pmark.t * Ast.t + + let view_ast f (t : _ Ast.ast) : view = (* Maps the three generic AST nodes onto the same-named [view] constructors, converting every child with [f]. *) + match t with + | Alternative a -> Alternative (List.map ~f a) + | No_case a -> No_case (f a) + | Case a -> Case (f a) + + let view_set (cset : Ast.cset) : view = (* Exposes a character-set expression as a [view]; operands of set operations are turned into [Ast.t] values with [Ast.t_of_cset], and [Cast] nodes are delegated to [view_ast]. *) + match cset with + | Cset set -> Set set + | Intersection sets -> Intersection (List.map sets ~f:Ast.t_of_cset) + | Complement sets -> Complement (List.map sets ~f:Ast.t_of_cset) + | Difference (x, y) -> Difference (Ast.t_of_cset x, Ast.t_of_cset y) + | Cast ast -> view_ast Ast.t_of_cset ast + + let view : Ast.t -> view = function (* One-level projection of an [Ast.t]: [Set] and [Ast] nodes go through [view_set] and [view_ast]; every other constructor maps to the same-named [view] constructor with its children left unchanged. *) + | Set s -> view_set s + | Ast s -> view_ast (fun x -> x) s + | Sem (sem, a) -> Sem (sem, a) + | Sem_greedy (sem, a) -> Sem_greedy (sem, a) + | Sequence s -> Sequence s + | Repeat (t, x, y) -> Repeat (t, x, y) + | Beg_of_line -> Beg_of_line + | End_of_line -> End_of_line + | Beg_of_word -> Beg_of_word + | End_of_word -> End_of_word + | Not_bound -> Not_bound + | Beg_of_str -> Beg_of_str + | End_of_str -> End_of_str + | Last_end_of_line -> Last_end_of_line + | Start -> Start + | Stop -> Stop + | No_group a -> No_group a + | Group (name, t) -> Group (name, t) + | Nest t -> Nest t + | Pmark (pmark, t) -> Pmark (pmark, t) +end diff --git a/lib/unicode/view.mli b/lib/unicode/view.mli new file mode 100644 index 00000000..1f92a058 --- /dev/null +++ b/lib/unicode/view.mli @@ -0,0 +1,69 @@ +module type T = sig + (** A view of the top-level of a regex. 
This type is unstable and may change + *) + type letter + + type ast + + module Cset : sig + type t + + module Range : sig + type t + + val first : t -> letter + val last : t -> letter + end + + val view : t -> Range.t list + end + + module Sem : sig + type t = [ `Longest | `Shortest | `First ] + end + + module Rep_kind : sig + type t = [ `Greedy | `Non_greedy ] + end + + type view = + | Set of Cset.t + | Sequence of ast list + | Alternative of ast list + | Repeat of ast * int * int option + | Beg_of_line + | End_of_line + | Beg_of_word + | End_of_word + | Not_bound + | Beg_of_str + | End_of_str + | Last_end_of_line + | Start + | Stop + | Sem of Sem.t * ast + | Sem_greedy of Rep_kind.t * ast + | Group of string option * ast + | No_group of ast + | Nest of ast + | Case of ast + | No_case of ast + | Intersection of ast list + | Complement of ast list + | Difference of ast * ast + | Pmark of Pmark.t * ast + + val view : ast -> view +end + +module Make + (Cset : Cset.T) + (Color_map : + Color_map.T + with type cp = Cset.cp + and type letter = Cset.letter + and type cset_t = Cset.t) : + T + with type ast = Ast.Make(Cset)(Color_map).t + and type Cset.t = Cset.t + and type letter = Cset.letter diff --git a/lib_test/unicode/concurrency/dune b/lib_test/unicode/concurrency/dune new file mode 100644 index 00000000..d3a0049f --- /dev/null +++ b/lib_test/unicode/concurrency/dune @@ -0,0 +1,13 @@ +(test + (name test_unicode) + (build_if + (>= %{ocaml_version} 5.0)) + (action + (pipe-outputs + (setenv + TSAN_OPTIONS + suppressions=suppress.txt + (run %{test})) + (run cat))) + (deps suppress.txt) + (libraries re_unicode)) diff --git a/lib_test/unicode/concurrency/suppress.txt b/lib_test/unicode/concurrency/suppress.txt new file mode 100644 index 00000000..513d95e1 --- /dev/null +++ b/lib_test/unicode/concurrency/suppress.txt @@ -0,0 +1,18 @@ + +# Data race between Compile.State.follow_transition (inlined in Compile.next) +# and Compile.State.set_transition 
+race_top:^camlRe__Compile.next + +# Data race within Compile.find_initial_state (read/write re.initial_states) +race_top:^camlRe__Compile.find_initial_state + +# Spurious data race due to the two-step initialization in Compile.State.make +# (between Compile.State.get_info and Compile.State.set_info, both inlined) +race_top:^camlRe__Compile.loop + +# Race within Automata.Desc.status and Automata.Desc.status_no_mutex +# (read/write s.status) +race_top:^camlRe__Automata.status + +# Race within Compile.final +race_top:^camlRe__Compile.final diff --git a/lib_test/unicode/concurrency/test.expected b/lib_test/unicode/concurrency/test.expected new file mode 100644 index 00000000..7026d1ff --- /dev/null +++ b/lib_test/unicode/concurrency/test.expected @@ -0,0 +1,2 @@ +Sequential +Concurrent diff --git a/lib_test/unicode/concurrency/test_unicode.ml b/lib_test/unicode/concurrency/test_unicode.ml new file mode 100644 index 00000000..889502cb --- /dev/null +++ b/lib_test/unicode/concurrency/test_unicode.ml @@ -0,0 +1,140 @@ +module Barrier = struct (* Spinning barrier for [size] participants, built on two atomic counters. *) + type t = + { waiters : int Atomic.t + ; size : int + ; passed : int Atomic.t + } + + let create n = { waiters = Atomic.make n; size = n; passed = Atomic.make 0 } (* [waiters] starts equal to [size], the value on which [await] spins before the reset. *) + + let await { waiters; size; passed } = (* The caller that brings [passed] to [size] resets both counters to 0; each caller spins while [waiters] equals [size], then increments [waiters] and spins until it reaches [size] again. *) + if Atomic.fetch_and_add passed 1 = size - 1 + then ( + Atomic.set passed 0; + Atomic.set waiters 0); + while Atomic.get waiters = size do + Domain.cpu_relax () + done; + Atomic.incr waiters; + while Atomic.get waiters < size do + Domain.cpu_relax () + done + ;; +end + +let shuffle_array a = (* Returns a randomly permuted copy of [a] (swaps driven by [Random.int]); the argument itself is not modified. *) + let n = Array.length a in + let a' = Array.copy a in + for i = n - 1 downto 1 do + let j = Random.int (i + 1) in + let temp = a'.(i) in + a'.(i) <- a'.(j); + a'.(j) <- temp + done; + a' +;; + +let inverse_permutation p = (* Builds [inv] such that [inv.(p.(i)) = i] for every index [i] of [p]; assumes the entries of [p] are valid indices - TODO confirm callers only pass permutations. *) + let n = Array.length p in + let inv = Array.make n 0 in + for i = 0 to n - 1 do + inv.(p.(i)) <- i + done; + inv +;; + +let apply_permutation p a = (* Builds [b] with [b.(i) = a.(p.(i))]; [a.(0)] is read to seed the result, so [a] must be non-empty. *) + let n = Array.length p in + let b = Array.make n a.(0) in + for i = 0 
to n - 1 do + b.(i) <- a.(p.(i)) + done; + b +;; + +(****) + +open Re_unicode.Utf8 + +let re1 = Re.(alt [ group (char 'a' |> letter); char 'b' |> letter ]) +let re2 = Re.(seq [ re1; re1 ]) +let re3 = Re.(seq [ re2; re2 ]) +let re4 = Re.(seq [ re3; re3 ]) + +let re5 = (* Either sixteen consecutive [re1] letters (two copies of [re4]) or one of four captured runs of one to four b characters. *) + Re.( + alt + [ seq [ re4; re4 ] + ; group (str "b") + ; group (str "bb") + ; group (str "bbb") + ; group (str "bbbb") + ]) +;; + +let size = 300 + +let strings = (* [size] random 30-character strings over the letters a and b. *) + Array.init size (fun _ -> String.init 30 (fun _ -> if Random.bool () then 'a' else 'b')) +;; + +let execute ~short re a = (* Runs [re] on [strings.(i)] for each entry [i] of [a], starting at position 23 when [short] and 0 otherwise, recording [None] on [Not_found]; the results are then reordered through the inverse of [a], i.e. indexed by string number rather than by visiting order. *) + apply_permutation + (inverse_permutation a) + (Array.map + (fun i -> + try Some (Re.exec ~pos:(if short then 30 - 7 else 0) re strings.(i)) with + | Not_found -> None) + a) +;; + +let compare_groups g g' = Re.Group.(all_offset g = all_offset g') (* Two match results agree when their arrays of group offsets are equal. *) + +let concurrent f f' = (* Runs the first thunk on the current domain and the second on a freshly spawned domain; both pass a two-party [Barrier] before starting. Returns the pair of results. *) + let barrier = Barrier.create 2 in + let domain = + Domain.spawn + @@ fun () -> + Barrier.await barrier; + f' () + in + Barrier.await barrier; + let res = f () in + let res' = Domain.join domain in + res, res' +;; + +let sequential f f' = f (), f' () (* Same interface as [concurrent], but both thunks run on the calling domain. *) + +let test compose ~short n = (* Repeats [n] times: compile [re5] afresh, run [execute] under two different visiting orders combined by [compose], and require equal results for every string. A mismatch, [Invalid_argument] or [Division_by_zero] is reported on stderr, and the process exits with code 1 once the loop is over. *) + let success = ref true in + for _ = 1 to n do + let re = Re.compile re5 in + let a = shuffle_array (Array.init size Fun.id) in + let a' = shuffle_array a in + try + let groups, groups' = + compose (fun () -> execute ~short re a) (fun () -> execute ~short re a') + in + let ok = Array.for_all2 (Option.equal compare_groups) groups groups' in + success := !success && ok; + if not ok then prerr_endline "Bad group" + with + | Invalid_argument msg -> + prerr_endline ("Invalid_argument " ^ msg); + success := false + | Division_by_zero -> + prerr_endline "Division_by_zero"; + success := false + done; + if not !success then exit 1 +;; + +let () = + prerr_endline "Sequential"; + test sequential ~short:false 20; + test sequential ~short:true 10; + prerr_endline "Concurrent"; + test ~short:false concurrent 750; + test ~short:true concurrent 250 +;; diff --git 
a/lib_test/unicode/expect/dune b/lib_test/unicode/expect/dune new file mode 100644 index 00000000..da8764c6 --- /dev/null +++ b/lib_test/unicode/expect/dune @@ -0,0 +1,41 @@ +(library + (name re_tests_unicode) + (libraries + re_private_unicode + ;; This is because of the (implicit_transitive_deps false) + ;; in dune-project + ppx_expect.config + ppx_expect.config_types + ppx_expect + ppx_expect_common_unicode + base + str + ppx_inline_test.config) + (inline_tests + (modes native js)) + (preprocess + (pps ppx_expect))) + +;; ppx_expect v16 depends on ppx_expect.common +(subdir + ppx_expect_common_unicode + (library + (name ppx_expect_common_unicode) + (enabled_if + (< %{ocaml_version} 5.0)) + (libraries (re_export ppx_expect.common))) + (library + (name ppx_expect_common_unicode) + (enabled_if + (>= %{ocaml_version} 5.0)))) + +;; this hackery is needed because ppx_expect itself uses re, therefore we need to mangle +;; the library name + +(subdir + re_private_unicode + (library + (name re_private_unicode) + (libraries uucp uunf)) + (copy_files %{project_root}/lib/unicode/*.{ml,mli}) + ) diff --git a/lib_test/unicode/expect/import.ml b/lib_test/unicode/expect/import.ml new file mode 100644 index 00000000..940454d2 --- /dev/null +++ b/lib_test/unicode/expect/import.ml @@ -0,0 +1,121 @@ +include Re_private_unicode.Import +module Fmt = Re_private_unicode.Fmt +module Dyn = Re_private_unicode.Dyn + +module Cset = Re_private_unicode.Cset.Utf8 +module Color_map = Re_private_unicode.Color_map.Utf8 +(* module Cset = Re_private_unicode.Cset.Latin1 +module Color_map = Re_private_unicode.Color_map.Latin1 *) + +module Re = struct + module Category = Re_private_unicode.Category.Make (Cset) + module Automata = Re_private_unicode.Automata.Make (Cset) + module Ast = Re_private_unicode.Ast.Make (Cset) (Color_map) + module Compile = Re_private_unicode.Compile.Make (Cset) (Color_map) + module Core = Re_private_unicode.Core.Make (Cset) (Color_map) + module Replace = 
Re_private_unicode.Replace.Make (Cset) (Color_map) + include Core + include Replace + module View = Re_private_unicode.View.Make (Cset) (Color_map) + module Emacs = Re_private_unicode.Emacs.Make (Cset) (Color_map) + module Glob = Re_private_unicode.Glob.Make (Cset) (Color_map) + module Perl = Re_private_unicode.Perl.Make (Cset) (Color_map) + module Pcre = Re_private_unicode.Pcre.Make (Cset) (Color_map) + module Posix = Re_private_unicode.Posix.Make (Cset) (Color_map) + module Str = Re_private_unicode.Str.Make (Cset) (Color_map) +end + +let printf = Printf.printf + +let t re s = (* Compiles [re], matches it against [s] with [Re.exec_opt] and prints the optional group using [Re.Group.pp]. *) + let re = Re.compile re in + let group = Re.exec_opt re s in + Format.printf "%a@." (Fmt.opt Re.Group.pp) group + +let re_whitespace = Re.Pcre.regexp "[\t ]+" +let re_eol = Re.compile Re.eol +let re_bow = Re.compile Re.bow +let re_eow = Re.compile Re.eow + +let strings = (* Prints a list of strings between square brackets, each element rendered by [Fmt.quoted_string] and separated by a semicolon and a space. *) + Format.printf "[%a]@." Fmt.(list ~pp_sep:(Fmt.lit "; ") Fmt.quoted_string) + +let re_empty = Re.Posix.compile_pat "" + +let invalid_argument f = (* Runs [f] and discards its result; an [Invalid_argument] is printed on stdout instead of propagating. Other exceptions are not caught. *) + try ignore (f ()) + with Invalid_argument s -> Format.printf "Invalid_argument %S@." s + +let ignore_or_exception f = (* Runs [f] and discards its result; any exception is printed rather than raised: [CodecError] by name, all others through [Printexc.to_string]. *) + try ignore (f ()) + with + | Re_private_unicode.Uucodecs.CodecError -> printf "\"CodecError\"\n" + | exn -> Format.printf "%S@." (Printexc.to_string exn) + +let exec_partial_detailed ?pos re s = (* Compiles [re], runs [Re.exec_partial_detailed] on [s] and prints the outcome. For a full match every group is printed as start,stop,substring; the substring field is left empty when [String.sub] rejects the offsets with [Invalid_argument]. *) + let re = Re.compile re in + let res = Re.exec_partial_detailed ?pos re s in + match res with + | `Mismatch -> Format.printf "`Mismatch@." + | `Partial position -> Format.printf "`Partial %d@." position + | `Full groups -> + Re.Group.all_offset groups |> Array.to_list + |> List.map ~f:(fun (a, b) -> + Printf.sprintf "%d,%d,%s" a b + (match String.sub s a (b - a) with + | exception Invalid_argument _ -> "" + | s -> Printf.sprintf "%S" s)) + |> String.concat ";" + |> Format.printf "`Full [|%s|]@." 
+ +let or_not_found f fmt v = (* Prints [f fmt (v ())]; if an exception escapes, prints the text Not_found for [Not_found] and the [Printexc.to_string] rendering for anything else. *) + try f fmt (v ()) with + | Not_found -> Format.fprintf fmt "Not_found" + | exn -> Format.fprintf fmt "%s" (Printexc.to_string exn) + +let array f fmt v = (* Prints the array [v] in [| e1; e2 |] form, using [f] as the element printer. *) + Format.fprintf fmt "[| %a |]" + (Fmt.list ~pp_sep:(Fmt.lit "; ") f) + (Array.to_list v) + +let offset fmt (x, y) = Format.fprintf fmt "(%d, %d)" x y + +let test_re ?pos ?len r s = (* Compiles [r], matches it against [s] and prints every group offset; a failed match is printed through [or_not_found] instead of raising. *) + let offsets () = Re.Group.all_offset (Re.exec ?pos ?len (Re.compile r) s) in + Format.printf "%a@." (or_not_found (array offset)) offsets + +let rec sexp_of_dyn (t : Dyn.t) : Base.Sexp.t = (* Converts a [Dyn.t] into an s-expression. Variants without arguments become a bare atom, variants with arguments a list headed by the constructor name, and record fields whose value converts to the empty list are dropped. *) + match t with + | Int i -> Atom (Int.to_string i) + | String s -> Atom s + | Tuple xs -> List (List.map xs ~f:sexp_of_dyn) + | Enum s -> Atom s + | Array xs -> List (List.map ~f:sexp_of_dyn @@ Array.to_list xs) + | List xs -> List (List.map ~f:sexp_of_dyn xs) + | Variant (name, []) -> Atom name + | Variant (name, xs) -> ( + let xs = List.map xs ~f:sexp_of_dyn in + match xs with [] -> List [] | xs -> List (Atom name :: xs)) + | Record fields -> + List + (List.filter_map fields ~f:(fun (name, v) -> + match sexp_of_dyn v with + | List [] -> None + | sexp -> Some (Base.Sexp.List [ Atom name; sexp ]))) + +let print_dyn dyn = sexp_of_dyn dyn |> Base.Sexp.to_string_hum |> print_endline + +let test f string = (* Parses [string] with [f] and prints the resulting AST through [Re.Ast.to_dyn]; a parse error is treated as a bug in the test itself and hits [assert false]. *) + match f string with + | Ok res -> print_dyn (Re.Ast.to_dyn res) + | Error _ -> assert false + +let string_make_of_int i = (* Encodes the character [Char.chr i] with [Cset.Codec.set] into a 4-byte scratch buffer and returns the [w] bytes that were written. NOTE(review): [i] goes through [Char.chr], so presumably only 0..255 is accepted - confirm against the [Char] module in scope. *) + let bytes = Bytes.create 4 in + let w = Cset.Codec.set bytes 0 (Cset.CodePage.of_char @@ Char.chr i) in + Bytes.sub_string bytes 0 w + +let string_make_of_char c = (* Encodes the single character [c] with [Cset.Codec.add] and returns the buffer contents as a string. *) + let buf = Buffer.create 4 in + Cset.Codec.add buf (Cset.CodePage.of_char c); + Buffer.contents buf diff --git a/lib_test/unicode/expect/test_186.ml b/lib_test/unicode/expect/test_186.ml new file mode 100644 index 00000000..32ebbf98 --- /dev/null +++ b/lib_test/unicode/expect/test_186.ml @@ -0,0 +1,78 @@ +open Import + +let print re result = + Printf.printf + "%s: %s\n" + re + (match result with + | Ok _ -> "backward range parsed" + | 
Error `Parse_error -> "parse error" + | Error `Not_supported -> "not supported") +;; + +let cases = [ "[1-0]"; "[5-1]"; "[6-6]"; "[z-a]"; "[b-b]" ] + +let test f = + List.iter cases ~f:(fun re -> + let result = f re in + print re result) +;; + +let%expect_test "perl" = + test Re.Perl.re_result; + [%expect + {| + [1-0]: backward range parsed + [5-1]: backward range parsed + [6-6]: backward range parsed + [z-a]: backward range parsed + [b-b]: backward range parsed + |}] +;; + +let%expect_test "pcre" = + test Re.Pcre.re_result; + [%expect + {| + [1-0]: backward range parsed + [5-1]: backward range parsed + [6-6]: backward range parsed + [z-a]: backward range parsed + [b-b]: backward range parsed + |}] +;; + +let%expect_test "posix" = + test Re.Posix.re_result; + [%expect + {| + [1-0]: backward range parsed + [5-1]: backward range parsed + [6-6]: backward range parsed + [z-a]: backward range parsed + [b-b]: backward range parsed + |}] +;; + +(* CR-someday rgrinberg: is this correct? *) +let%expect_test "emacs" = + test Re.Emacs.re_result; + [%expect + {| + [1-0]: backward range parsed + [5-1]: backward range parsed + [6-6]: backward range parsed + [z-a]: backward range parsed + [b-b]: backward range parsed + |}] +;; + +module Re = Re_private_unicode.Re_unicode.Utf8.Re + +(* We allow backward ranges in re. We could forbid them? *) +let%expect_test "re" = + Format.printf "%a@." Re.pp (Re.(rg (char '5') (char '0'))); + [%expect {| (Set 48-53) |}]; + Format.printf "%a@." 
Re.pp (Re.(rg (char '0') (char '5'))); + [%expect {| (Set 48-53) |}] +;; diff --git a/lib_test/unicode/expect/test_alternation.ml b/lib_test/unicode/expect/test_alternation.ml new file mode 100644 index 00000000..96063dbc --- /dev/null +++ b/lib_test/unicode/expect/test_alternation.ml @@ -0,0 +1,28 @@ +open Import + +(* +let test f string = + match f string with + | Ok res -> print_dyn (Ast.to_dyn res) + | Error _ -> assert false + *) + +let%expect_test "pcre" = + test Re.Pcre.re_result "(a|b|c)"; + [%expect + {| (Group (Set (Cast (Alternative (Cset 97) (Cset 98) (Cset 99))))) |}] + +let%expect_test "emacs" = + test Re.Emacs.re_result {|\(a\|b\|c\)|}; + [%expect + {| (Group (Set (Cast (Alternative (Cset 97) (Cset 98) (Cset 99))))) |}] + +let%expect_test "perl" = + test Re.Perl.re_result "(a|b|c)"; + [%expect + {| (Group (Set (Cast (Alternative (Cset 97) (Cset 98) (Cset 99))))) |}] + +let%expect_test "posix" = + test Re.Posix.re_result "(a|b|c)"; + [%expect + {| (Group (Set (Cast (Alternative (Cset 97) (Cset 98) (Cset 99))))) |}] diff --git a/lib_test/unicode/expect/test_automata.ml b/lib_test/unicode/expect/test_automata.ml new file mode 100644 index 00000000..6c0245b9 --- /dev/null +++ b/lib_test/unicode/expect/test_automata.ml @@ -0,0 +1,220 @@ +open! 
Import + +include struct + module Automata = struct + include Re_private_unicode.Automata + include Re.Automata + end + + let empty = Automata.empty + let eps = Automata.eps + let cst = Automata.cst + let seq = Automata.seq + let rep = Automata.rep +end + +let pp_state state = print_dyn (Automata.State.to_dyn state) +let pp_expr fmt expr = Automata.pp fmt expr +let cat = Re.Category.dummy + +let str ids sem str = + let rec loop s = + match (s () : _ Seq.node) with + | Nil -> eps ids + | Cons (c, rest) -> + let c = cst ids (Cset.csingle c) in + seq ids sem c (loop rest) + in + loop (Cset.Codec.to_seq str) + +let loop ?(max = 100) wa d c = + let cset = Cset.CodePage.(of_char c |> from_letter) in + let rec loop d n = + if n > 0 then ( + print_dyn (Automata.State.to_dyn d); + match Automata.State.status_no_mutex d with + | Failed -> Format.printf "> failed@." + | Match _ -> Format.printf "> matched@." + | Running -> + let d = Automata.delta wa cat cset d in + loop d (n - 1)) + in + loop d max + +let%expect_test "string" = + let re = + let n = 4 in + let s = + let c = 'a' in + String.make n c + in + let ids = Automata.Ids.create () in + str ids `First s + in + let wa = Automata.Working_area.create () in + loop wa (Automata.State.create cat re) 'a'; + [%expect + {| + ((TExp (first (Seq 97 97 97 97)))) + ((TExp (first (Seq 97 97 97)))) + ((TExp (first (Seq 97 97)))) + ((TExp 97)) + ((TExp Eps)) + ((TMarks ())) + > matched + |}]; + loop wa (Automata.State.create cat re) 'b'; + [%expect {| + ((TExp (first (Seq 97 97 97 97)))) + () + > failed + |}] + +let%expect_test "alternation" = + let re = + let ids = Automata.Ids.create () in + let n = 4 in + let s = + let c = 'a' in + String.make n c + in + List.init ~len:n ~f:(fun i -> + let prefix = str ids `First s in + let suffix = + let c = Char.chr (Char.code 'b' + i) in + cst ids (Cset.CodePage.of_char c |> Cset.csingle) + in + seq ids `First prefix suffix) + |> Automata.alt ids + in + let wa = Automata.Working_area.create () in 
+ loop wa (Automata.State.create cat re) 'a'; + [%expect + {| + ((TExp + (Alt (first (Seq (Seq 97 97 97 97) 98)) (first (Seq (Seq 97 97 97 97) 99)) + (first (Seq (Seq 97 97 97 97) 100)) (first (Seq (Seq 97 97 97 97) 101))))) + ((first (TSeq ((TExp (Seq 97 97 97))) 98)) + (first (TSeq ((TExp (Seq 97 97 97))) 99)) + (first (TSeq ((TExp (Seq 97 97 97))) 100)) + (first (TSeq ((TExp (Seq 97 97 97))) 101))) + ((first (TSeq ((TExp (Seq 97 97))) 98)) + (first (TSeq ((TExp (Seq 97 97))) 99)) + (first (TSeq ((TExp (Seq 97 97))) 100)) + (first (TSeq ((TExp (Seq 97 97))) 101))) + ((first (TSeq ((TExp 97)) 98)) (first (TSeq ((TExp 97)) 99)) + (first (TSeq ((TExp 97)) 100)) (first (TSeq ((TExp 97)) 101))) + ((TExp 98) (TExp 99) (TExp 100) (TExp 101)) + () + > failed + |}] + +let%expect_test "alternation shared prefix" = + let n = 4 in + let re = + let ids = Automata.Ids.create () in + let prefix = + let s = + let c = 'a' in + String.make n c + in + str ids `First s + in + let suffix = + List.init ~len:n ~f:(fun i -> + let c = + Cset.CodePage.of_int + (Cset.CodePage.(of_char 'b' |> from_letter |> to_int) + i) + in + cst ids (Cset.CodePage.to_letter c |> Cset.csingle)) + |> Automata.alt ids + in + seq ids `First prefix suffix + in + let wa = Automata.Working_area.create () in + loop wa (Automata.State.create cat re) 'a'; + [%expect + {| + ((TExp (first (Seq (Seq 97 97 97 97) (Alt 98 99 100 101))))) + ((first (TSeq ((TExp (Seq 97 97 97))) (Alt 98 99 100 101)))) + ((first (TSeq ((TExp (Seq 97 97))) (Alt 98 99 100 101)))) + ((first (TSeq ((TExp 97)) (Alt 98 99 100 101)))) + ((TExp (Alt 98 99 100 101))) + () + > failed + |}] + +let%expect_test "kleene star" = + let re = + let ids = Automata.Ids.create () in + rep ids `Greedy `First (cst ids (Cset.CodePage.of_char 'z' |> Cset.csingle)) + in + let wa = Automata.Working_area.create () in + loop ~max:4 wa (Automata.State.create cat re) 'z'; + [%expect + {| + ((TExp (first (Rep 122)))) + ((TExp (first (Rep 122))) (TMarks ())) + ((TExp 
(first (Rep 122))) (TMarks ())) + ((TExp (first (Rep 122))) (TMarks ())) + |}]; + loop ~max:3 wa (Automata.State.create cat re) 'a'; + [%expect + {| + ((TExp (first (Rep 122)))) + ((TMarks ())) + > matched + |}] + +let%expect_test "derivative recomputation" = + let sem = `Longest in + let re = + let ids = Automata.Ids.create () in + (* let lhs = rep ids `Non_greedy sem (cst ids Cset.cany) in *) + let lhs = + rep ids `Non_greedy sem + (cst ids + (Cset.cseq + Cset.CodePage.(of_int 0 |> to_letter) + Cset.CodePage.(of_int 255 |> to_letter))) + in + let rhs = + seq ids sem + (Automata.mark ids Automata.Mark.start) + (Automata.alt ids + [ + cst ids (Cset.CodePage.of_char 'z' |> Cset.csingle); + cst ids (Cset.CodePage.of_char 'b' |> Cset.csingle); + ]) + in + seq ids sem lhs rhs + in + let wa = Automata.Working_area.create () in + loop ~max:7 wa (Automata.State.create cat re) 'z'; + [%expect + {| + ((TExp (long (Seq (Rep ((0 255))) (Seq (Mark 0) (Alt 122 98)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 0)))) Eps)) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 1)))) Eps) (TMarks ((marks ((0 0)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 0)))) Eps) (TMarks ((marks ((0 1)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 1)))) Eps) (TMarks ((marks ((0 0)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 0)))) Eps) (TMarks ((marks ((0 1)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98)))) + (TExp ((marks ((0 1)))) Eps) (TMarks ((marks ((0 0)))))) + |}]; + loop ~max:7 wa (Automata.State.create cat re) 'a'; + [%expect + {| + ((TExp (long (Seq (Rep ((0 255))) (Seq (Mark 0) (Alt 122 98)))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 
98))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98))))) + ((long (TSeq ((TExp (Rep ((0 255))))) (Seq (Mark 0) (Alt 122 98))))) + |}] diff --git a/lib_test/unicode/expect/test_bit_vector.ml b/lib_test/unicode/expect/test_bit_vector.ml new file mode 100644 index 00000000..1b8d0d25 --- /dev/null +++ b/lib_test/unicode/expect/test_bit_vector.ml @@ -0,0 +1,26 @@ +open! Import +module Bit_vector = Re_private_unicode.Bit_vector + +let%expect_test "reset_zero" = + let n = Bit_vector.create_zero 10 in + let print () = Format.printf "%a@." Bit_vector.pp n in + print (); + [%expect {| + (len 10) + (bits "\000\000") |}]; + Bit_vector.reset_zero n; + print (); + [%expect {| + (len 10) + (bits "\000\000") |}]; + Bit_vector.set n 1 true; + print (); + [%expect {| + (len 10) + (bits "\002\000") |}]; + Bit_vector.reset_zero n; + print (); + [%expect {| + (len 10) + (bits "\000\000") |}] +;; diff --git a/lib_test/unicode/expect/test_category.ml b/lib_test/unicode/expect/test_category.ml new file mode 100644 index 00000000..ceabf190 --- /dev/null +++ b/lib_test/unicode/expect/test_category.ml @@ -0,0 +1,15 @@ +open! 
Import + + +let%expect_test "Category.from_letter" = + for i = 0 to 255 do + let cat = Re.Category.from_letter Cset.CodePage.(of_int i |> to_letter) in + if Cset.(mem (CodePage.of_int i) cword) then assert (Re.Category.(intersect letter cat)) + done +;; + +let%expect_test "newline" = + let cat = Re.Category.from_letter (Re.char '\n') in + assert (Re.Category.(intersect cat newline)); + assert (Re.Category.(intersect cat not_letter)) +;; diff --git a/lib_test/unicode/expect/test_color.ml b/lib_test/unicode/expect/test_color.ml new file mode 100644 index 00000000..f8c3a4c9 --- /dev/null +++ b/lib_test/unicode/expect/test_color.ml @@ -0,0 +1,45 @@ +open Import + +(* let pp_letters ppf s = + Format.pp_print_iter + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + Cset.Codec.iter + (fun fmt letter -> + Format.fprintf fmt "%a" Cset.CodePage.pp + (Cset.CodePage.from_letter letter)) + ppf s *) +let pp_letters ppf s = + Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ",@ ") + (fun fmt letter -> + Format.fprintf fmt "%a" Cset.CodePage.pp + (Cset.CodePage.from_letter letter)) + ppf (Cset.Codec.to_list s) + +let l = [ (5, 255) ] + +let all_chars = + let bytes = Bytes.create (256 * 4) in + let pos = ref 0 in + Stdlib.List.iter + (fun (i1, i2) -> + for i = i1 to i2 do + pos := !pos + Cset.Codec.set bytes !pos (Cset.CodePage.of_char @@ Char.chr i) + done) + l; + Bytes.sub_string bytes 0 !pos + +let%expect_test "match an re that distinguishes every single char" = + (* Format.printf "[Unicode.Test_color] all_chars: '%a'\n" pp_letters all_chars; *) + let all_chars = Re.set all_chars in + (* Format.printf "[Unicode.Test_color] set: '%a'\n" Re.pp all_chars; *) + let whole_string = Re.whole_string all_chars in + (* Format.printf "[Unicode.Test_color] whole_string: '%a'\n" Re.pp whole_string; *) + let re = Re.compile whole_string in + (* Format.printf "[Unicode.Test_color] re: '%a'\n" Re.pp_re re; *) + Stdlib.List.iter + (fun (c1, c2) -> + for i = c1 to c2 do + assert 
(Re.execp re (string_make_of_int i)) + done) + l diff --git a/lib_test/unicode/expect/test_csets.ml b/lib_test/unicode/expect/test_csets.ml new file mode 100644 index 00000000..1481e0e6 --- /dev/null +++ b/lib_test/unicode/expect/test_csets.ml @@ -0,0 +1,1264 @@ +open! Import + + +let%expect_test "empty" = + Format.printf "%a@." Cset.pp Cset.empty; + [%expect {| |}] +;; + +let%expect_test "ascii" = + Format.printf "%a@." Cset.pp Cset.ascii; + [%expect {| 0-127 |}] +;; + +let%expect_test "cdigit" = + Format.printf "%a@." Cset.pp Cset.cdigit; + [%expect {| + 48-57, 1632-1641, 1776-1785, 1984-1993, 2406-2415, 2534-2543, 2662-2671, + 2790-2799, 2918-2927, 3046-3055, 3174-3183, 3302-3311, 3430-3439, 3558-3567, + 3664-3673, 3792-3801, 3872-3881, 4160-4169, 4240-4249, 6112-6121, 6160-6169, + 6470-6479, 6608-6617, 6784-6793, 6800-6809, 6992-7001, 7088-7097, 7232-7241, + 7248-7257, 42528-42537, 43216-43225, 43264-43273, 43472-43481, 43504-43513, + 43600-43609, 44016-44025, 65296-65305, 66720-66729, 68912-68921, 68928-68937, + 69734-69743, 69872-69881, 69942-69951, 70096-70105, 70384-70393, 70736-70745, + 70864-70873, 71248-71257, 71360-71369, 71376-71395, 71472-71481, 71904-71913, + 72016-72025, 72688-72697, 72784-72793, 73040-73049, 73120-73129, 73184-73193, + 73552-73561, 90416-90425, 92768-92777, 92864-92873, 93008-93017, 93552-93561, + 118000-118009, 120782-120831, 123200-123209, 123632-123641, 124144-124153, + 124401-124410, 125264-125273, + 130032-130041 + |}] +;; + +let%expect_test "calpha" = + Format.printf "%a@." 
Cset.pp Cset.calpha; + [%expect {| + 65-90, 97-122, 170, 181, 186, 192-214, 216-246, 248-705, 710-721, 736-740, + 748, 750, 880-884, 886-887, 890-893, 895, 902, 904-906, 908, 910-929, + 931-1013, 1015-1153, 1162-1327, 1329-1366, 1369, 1376-1416, 1488-1514, + 1519-1522, 1568-1610, 1646-1647, 1649-1747, 1749, 1765-1766, 1774-1775, + 1786-1788, 1791, 1808, 1810-1839, 1869-1957, 1969, 1994-2026, 2036-2037, + 2042, 2048-2069, 2074, 2084, 2088, 2112-2136, 2144-2154, 2160-2183, + 2185-2191, 2208-2249, 2308-2361, 2365, 2384, 2392-2401, 2417-2432, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493, 2510, 2524-2525, + 2527-2529, 2544-2545, 2556, 2565-2570, 2575-2576, 2579-2600, 2602-2608, + 2610-2611, 2613-2614, 2616-2617, 2649-2652, 2654, 2674-2676, 2693-2701, + 2703-2705, 2707-2728, 2730-2736, 2738-2739, 2741-2745, 2749, 2768, 2784-2785, + 2809, 2821-2828, 2831-2832, 2835-2856, 2858-2864, 2866-2867, 2869-2873, 2877, + 2908-2909, 2911-2913, 2929, 2947, 2949-2954, 2958-2960, 2962-2965, 2969-2970, + 2972, 2974-2975, 2979-2980, 2984-2986, 2990-3001, 3024, 3077-3084, 3086-3088, + 3090-3112, 3114-3129, 3133, 3160-3162, 3164-3165, 3168-3169, 3200, 3205-3212, + 3214-3216, 3218-3240, 3242-3251, 3253-3257, 3261, 3292-3294, 3296-3297, + 3313-3314, 3332-3340, 3342-3344, 3346-3386, 3389, 3406, 3412-3414, 3423-3425, + 3450-3455, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, 3585-3632, + 3634-3635, 3648-3654, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, 3751-3760, + 3762-3763, 3773, 3776-3780, 3782, 3804-3807, 3840, 3904-3911, 3913-3948, + 3976-3980, 4096-4138, 4159, 4176-4181, 4186-4189, 4193, 4197-4198, 4206-4208, + 4213-4225, 4238, 4256-4293, 4295, 4301, 4304-4346, 4348-4680, 4682-4685, + 4688-4694, 4696, 4698-4701, 4704-4744, 4746-4749, 4752-4784, 4786-4789, + 4792-4798, 4800, 4802-4805, 4808-4822, 4824-4880, 4882-4885, 4888-4954, + 4992-5007, 5024-5109, 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, + 5873-5880, 5888-5905, 5919-5937, 5952-5969, 
5984-5996, 5998-6000, 6016-6067, + 6103, 6108, 6176-6264, 6272-6276, 6279-6312, 6314, 6320-6389, 6400-6430, + 6480-6509, 6512-6516, 6528-6571, 6576-6601, 6656-6678, 6688-6740, 6823, + 6917-6963, 6981-6988, 7043-7072, 7086-7087, 7098-7141, 7168-7203, 7245-7247, + 7258-7293, 7296-7306, 7312-7354, 7357-7359, 7401-7404, 7406-7411, 7413-7414, + 7418, 7424-7615, 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, + 8027, 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, + 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8305, 8319, 8336-8348, + 8450, 8455, 8458-8467, 8469, 8473-8477, 8484, 8486, 8488, 8490-8493, + 8495-8505, 8508-8511, 8517-8521, 8526, 8579-8580, 11264-11492, 11499-11502, + 11506-11507, 11520-11557, 11559, 11565, 11568-11623, 11631, 11648-11670, + 11680-11686, 11688-11694, 11696-11702, 11704-11710, 11712-11718, 11720-11726, + 11728-11734, 11736-11742, 11823, 12293-12294, 12337-12341, 12347-12348, + 12353-12438, 12445-12447, 12449-12538, 12540-12543, 12549-12591, 12593-12686, + 12704-12735, 12784-12799, 13312-19903, 19968-42124, 42192-42237, 42240-42508, + 42512-42527, 42538-42539, 42560-42606, 42623-42653, 42656-42725, 42775-42783, + 42786-42888, 42891-42972, 42993-43009, 43011-43013, 43015-43018, 43020-43042, + 43072-43123, 43138-43187, 43250-43255, 43259, 43261-43262, 43274-43301, + 43312-43334, 43360-43388, 43396-43442, 43471, 43488-43492, 43494-43503, + 43514-43518, 43520-43560, 43584-43586, 43588-43595, 43616-43638, 43642, + 43646-43695, 43697, 43701-43702, 43705-43709, 43712, 43714, 43739-43741, + 43744-43754, 43762-43764, 43777-43782, 43785-43790, 43793-43798, 43808-43814, + 43816-43822, 43824-43866, 43868-43881, 43888-44002, 44032-55203, 55216-55238, + 55243-55291, 63744-64109, 64112-64217, 64256-64262, 64275-64279, 64285, + 64287-64296, 64298-64310, 64312-64316, 64318, 64320-64321, 64323-64324, + 64326-64433, 64467-64829, 64848-64911, 64914-64967, 65008-65019, 65136-65140, + 65142-65276, 65313-65338, 65345-65370, 
65382-65470, 65474-65479, 65482-65487, + 65490-65495, 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 66176-66204, 66208-66256, 66304-66335, + 66349-66368, 66370-66377, 66384-66421, 66432-66461, 66464-66499, 66504-66511, + 66560-66717, 66736-66771, 66776-66811, 66816-66855, 66864-66915, 66928-66938, + 66940-66954, 66956-66962, 66964-66965, 66967-66977, 66979-66993, 66995-67001, + 67003-67004, 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67680-67702, 67712-67742, 67808-67826, 67828-67829, + 67840-67861, 67872-67897, 67904-67929, 67968-68023, 68030-68031, 68096, + 68112-68115, 68117-68119, 68121-68149, 68192-68220, 68224-68252, 68288-68295, + 68297-68324, 68352-68405, 68416-68437, 68448-68466, 68480-68497, 68608-68680, + 68736-68786, 68800-68850, 68864-68899, 68938-68965, 68975-68997, 69248-69289, + 69296-69297, 69314-69319, 69376-69404, 69415, 69424-69445, 69488-69505, + 69552-69572, 69600-69622, 69635-69687, 69745-69746, 69749, 69763-69807, + 69840-69864, 69891-69926, 69956, 69959, 69968-70002, 70006, 70019-70066, + 70081-70084, 70106, 70108, 70144-70161, 70163-70187, 70207-70208, + 70272-70278, 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70366, + 70405-70412, 70415-70416, 70419-70440, 70442-70448, 70450-70451, 70453-70457, + 70461, 70480, 70493-70497, 70528-70537, 70539, 70542, 70544-70581, 70583, + 70609, 70611, 70656-70708, 70727-70730, 70751-70753, 70784-70831, + 70852-70853, 70855, 71040-71086, 71128-71131, 71168-71215, 71236, + 71296-71338, 71352, 71424-71450, 71488-71494, 71680-71723, 71840-71903, + 71935-71942, 71945, 71948-71955, 71957-71958, 71960-71983, 71999, 72001, + 72096-72103, 72106-72144, 72161, 72163, 72192, 72203-72242, 72250, 72272, + 72284-72329, 72349, 72368-72440, 72640-72672, 72704-72712, 72714-72750, + 72768, 72818-72847, 72960-72966, 72968-72969, 72971-73008, 
73030, + 73056-73061, 73063-73064, 73066-73097, 73112, 73136-73179, 73440-73458, + 73474, 73476-73488, 73490-73523, 73648, 73728-74649, 74880-75075, + 77712-77808, 77824-78895, 78913-78918, 78944-82938, 82944-83526, 90368-90397, + 92160-92728, 92736-92766, 92784-92862, 92880-92909, 92928-92975, 92992-92995, + 93027-93047, 93053-93071, 93504-93548, 93760-93823, 93856-93880, 93883-93907, + 93952-94026, 94032, 94099-94111, 94176-94177, 94179, 94194-94195, + 94208-101589, 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, 110948-110951, + 110960-111355, 113664-113770, 113776-113788, 113792-113800, 113808-113817, + 119808-119892, 119894-119964, 119966-119967, 119970, 119973-119974, + 119977-119980, 119982-119993, 119995, 119997-120003, 120005-120069, + 120071-120074, 120077-120084, 120086-120092, 120094-120121, 120123-120126, + 120128-120132, 120134, 120138-120144, 120146-120485, 120488-120512, + 120514-120538, 120540-120570, 120572-120596, 120598-120628, 120630-120654, + 120656-120686, 120688-120712, 120714-120744, 120746-120770, 120772-120779, + 122624-122654, 122661-122666, 122928-122989, 123136-123180, 123191-123197, + 123214, 123536-123565, 123584-123627, 124112-124139, 124368-124397, 124400, + 124608-124638, 124640-124642, 124644-124645, 124647-124653, 124656-124660, + 124670-124671, 124896-124902, 124904-124907, 124909-124910, 124912-124926, + 124928-125124, 125184-125251, 125259, 126464-126467, 126469-126495, + 126497-126498, 126500, 126503, 126505-126514, 126516-126519, 126521, 126523, + 126530, 126535, 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, + 126553, 126555, 126557, 126559, 126561-126562, 126564, 126567-126570, + 126572-126578, 126580-126583, 126585-126588, 126590, 126592-126601, + 126603-126619, 126625-126627, 126629-126633, 126635-126651, 131072-173791, + 173824-178205, 178208-183981, 183984-191456, 191472-192093, 194560-195101, + 196608-201546, + 201552-210041 + 
|}] +;; + +let%expect_test "cword" = + Format.printf "%a@." Cset.pp Cset.cword; + [%expect {| + 48-57, 65-90, 95, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 880-884, 886-887, 890-893, 895, + 902, 904-906, 908, 910-929, 931-1013, 1015-1153, 1162-1327, 1329-1366, 1369, + 1376-1416, 1488-1514, 1519-1522, 1568-1610, 1632-1641, 1646-1647, 1649-1747, + 1749, 1765-1766, 1774-1788, 1791, 1808, 1810-1839, 1869-1957, 1969, + 1984-2026, 2036-2037, 2042, 2048-2069, 2074, 2084, 2088, 2112-2136, + 2144-2154, 2160-2183, 2185-2191, 2208-2249, 2308-2361, 2365, 2384, 2392-2401, + 2406-2415, 2417-2432, 2437-2444, 2447-2448, 2451-2472, 2474-2480, 2482, + 2486-2489, 2493, 2510, 2524-2525, 2527-2529, 2534-2545, 2548-2553, 2556, + 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, 2613-2614, 2616-2617, + 2649-2652, 2654, 2662-2671, 2674-2676, 2693-2701, 2703-2705, 2707-2728, + 2730-2736, 2738-2739, 2741-2745, 2749, 2768, 2784-2785, 2790-2799, 2809, + 2821-2828, 2831-2832, 2835-2856, 2858-2864, 2866-2867, 2869-2873, 2877, + 2908-2909, 2911-2913, 2918-2927, 2929-2935, 2947, 2949-2954, 2958-2960, + 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, 2984-2986, 2990-3001, 3024, + 3046-3058, 3077-3084, 3086-3088, 3090-3112, 3114-3129, 3133, 3160-3162, + 3164-3165, 3168-3169, 3174-3183, 3192-3198, 3200, 3205-3212, 3214-3216, + 3218-3240, 3242-3251, 3253-3257, 3261, 3292-3294, 3296-3297, 3302-3311, + 3313-3314, 3332-3340, 3342-3344, 3346-3386, 3389, 3406, 3412-3414, 3416-3425, + 3430-3448, 3450-3455, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, + 3558-3567, 3585-3632, 3634-3635, 3648-3654, 3664-3673, 3713-3714, 3716, + 3718-3722, 3724-3747, 3749, 3751-3760, 3762-3763, 3773, 3776-3780, 3782, + 3792-3801, 3804-3807, 3840, 3872-3891, 3904-3911, 3913-3948, 3976-3980, + 4096-4138, 4159-4169, 4176-4181, 4186-4189, 4193, 4197-4198, 4206-4208, + 4213-4225, 4238, 4240-4249, 4256-4293, 4295, 4301, 4304-4346, 4348-4680, + 4682-4685, 
4688-4694, 4696, 4698-4701, 4704-4744, 4746-4749, 4752-4784, + 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, 4824-4880, 4882-4885, + 4888-4954, 4969-4988, 4992-5007, 5024-5109, 5112-5117, 5121-5740, 5743-5759, + 5761-5786, 5792-5866, 5870-5880, 5888-5905, 5919-5937, 5952-5969, 5984-5996, + 5998-6000, 6016-6067, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, 6176-6264, + 6272-6276, 6279-6312, 6314, 6320-6389, 6400-6430, 6470-6509, 6512-6516, + 6528-6571, 6576-6601, 6608-6618, 6656-6678, 6688-6740, 6784-6793, 6800-6809, + 6823, 6917-6963, 6981-6988, 6992-7001, 7043-7072, 7086-7141, 7168-7203, + 7232-7241, 7245-7293, 7296-7306, 7312-7354, 7357-7359, 7401-7404, 7406-7411, + 7413-7414, 7418, 7424-7615, 7680-7957, 7960-7965, 7968-8005, 8008-8013, + 8016-8023, 8025, 8027, 8029, 8031-8061, 8064-8116, 8118-8124, 8126, + 8130-8132, 8134-8140, 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, + 8304-8305, 8308-8313, 8319-8329, 8336-8348, 8450, 8455, 8458-8467, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, 8508-8511, 8517-8521, + 8526, 8528-8585, 9312-9371, 9450-9471, 10102-10131, 11264-11492, 11499-11502, + 11506-11507, 11517, 11520-11557, 11559, 11565, 11568-11623, 11631, + 11648-11670, 11680-11686, 11688-11694, 11696-11702, 11704-11710, 11712-11718, + 11720-11726, 11728-11734, 11736-11742, 11823, 12293-12295, 12321-12329, + 12337-12341, 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12690-12693, 12704-12735, 12784-12799, 12832-12841, + 12872-12879, 12881-12895, 12928-12937, 12977-12991, 13312-19903, 19968-42124, + 42192-42237, 42240-42508, 42512-42539, 42560-42606, 42623-42653, 42656-42735, + 42775-42783, 42786-42888, 42891-42972, 42993-43009, 43011-43013, 43015-43018, + 43020-43042, 43056-43061, 43072-43123, 43138-43187, 43216-43225, 43250-43255, + 43259, 43261-43262, 43264-43301, 43312-43334, 43360-43388, 43396-43442, + 43471-43481, 43488-43492, 43494-43518, 43520-43560, 43584-43586, 43588-43595, + 
43600-43609, 43616-43638, 43642, 43646-43695, 43697, 43701-43702, + 43705-43709, 43712, 43714, 43739-43741, 43744-43754, 43762-43764, + 43777-43782, 43785-43790, 43793-43798, 43808-43814, 43816-43822, 43824-43866, + 43868-43881, 43888-44002, 44016-44025, 44032-55203, 55216-55238, 55243-55291, + 63744-64109, 64112-64217, 64256-64262, 64275-64279, 64285, 64287-64296, + 64298-64310, 64312-64316, 64318, 64320-64321, 64323-64324, 64326-64433, + 64467-64829, 64848-64911, 64914-64967, 65008-65019, 65136-65140, 65142-65276, + 65296-65305, 65313-65338, 65345-65370, 65382-65470, 65474-65479, 65482-65487, + 65490-65495, 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 65799-65843, 65856-65912, 65930-65931, + 66176-66204, 66208-66256, 66273-66299, 66304-66339, 66349-66378, 66384-66421, + 66432-66461, 66464-66499, 66504-66511, 66513-66517, 66560-66717, 66720-66729, + 66736-66771, 66776-66811, 66816-66855, 66864-66915, 66928-66938, 66940-66954, + 66956-66962, 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, 67463-67504, + 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, 67644, + 67647-67669, 67672-67702, 67705-67742, 67751-67759, 67808-67826, 67828-67829, + 67835-67867, 67872-67897, 67904-67929, 67968-68023, 68028-68047, 68050-68096, + 68112-68115, 68117-68119, 68121-68149, 68160-68168, 68192-68222, 68224-68255, + 68288-68295, 68297-68324, 68331-68335, 68352-68405, 68416-68437, 68440-68466, + 68472-68497, 68521-68527, 68608-68680, 68736-68786, 68800-68850, 68858-68899, + 68912-68921, 68928-68965, 68975-68997, 69216-69246, 69248-69289, 69296-69297, + 69314-69319, 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69635-69687, 69714-69743, 69745-69746, 69749, 69763-69807, + 69840-69864, 69872-69881, 69891-69926, 69942-69951, 69956, 69959, + 69968-70002, 70006, 70019-70066, 70081-70084, 70096-70106, 70108, 
+ 70113-70132, 70144-70161, 70163-70187, 70207-70208, 70272-70278, 70280, + 70282-70285, 70287-70301, 70303-70312, 70320-70366, 70384-70393, 70405-70412, + 70415-70416, 70419-70440, 70442-70448, 70450-70451, 70453-70457, 70461, + 70480, 70493-70497, 70528-70537, 70539, 70542, 70544-70581, 70583, 70609, + 70611, 70656-70708, 70727-70730, 70736-70745, 70751-70753, 70784-70831, + 70852-70853, 70855, 70864-70873, 71040-71086, 71128-71131, 71168-71215, + 71236, 71248-71257, 71296-71338, 71352, 71360-71369, 71376-71395, + 71424-71450, 71472-71483, 71488-71494, 71680-71723, 71840-71922, 71935-71942, + 71945, 71948-71955, 71957-71958, 71960-71983, 71999, 72001, 72016-72025, + 72096-72103, 72106-72144, 72161, 72163, 72192, 72203-72242, 72250, 72272, + 72284-72329, 72349, 72368-72440, 72640-72672, 72688-72697, 72704-72712, + 72714-72750, 72768, 72784-72812, 72818-72847, 72960-72966, 72968-72969, + 72971-73008, 73030, 73040-73049, 73056-73061, 73063-73064, 73066-73097, + 73112, 73120-73129, 73136-73179, 73184-73193, 73440-73458, 73474, + 73476-73488, 73490-73523, 73552-73561, 73648, 73664-73684, 73728-74649, + 74752-74862, 74880-75075, 77712-77808, 77824-78895, 78913-78918, 78944-82938, + 82944-83526, 90368-90397, 90416-90425, 92160-92728, 92736-92766, 92768-92777, + 92784-92862, 92864-92873, 92880-92909, 92928-92975, 92992-92995, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93548, 93552-93561, 93760-93846, + 93856-93880, 93883-93907, 93952-94026, 94032, 94099-94111, 94176-94177, + 94179, 94194-94198, 94208-101589, 101631-101662, 101760-101874, + 110576-110579, 110581-110587, 110589-110590, 110592-110882, 110898, + 110928-110930, 110933, 110948-110951, 110960-111355, 113664-113770, + 113776-113788, 113792-113800, 113808-113817, 118000-118009, 119488-119507, + 119520-119539, 119648-119672, 119808-119892, 119894-119964, 119966-119967, + 119970, 119973-119974, 119977-119980, 119982-119993, 119995, 119997-120003, + 120005-120069, 120071-120074, 120077-120084, 
120086-120092, 120094-120121, + 120123-120126, 120128-120132, 120134, 120138-120144, 120146-120485, + 120488-120512, 120514-120538, 120540-120570, 120572-120596, 120598-120628, + 120630-120654, 120656-120686, 120688-120712, 120714-120744, 120746-120770, + 120772-120779, 120782-120831, 122624-122654, 122661-122666, 122928-122989, + 123136-123180, 123191-123197, 123200-123209, 123214, 123536-123565, + 123584-123627, 123632-123641, 124112-124139, 124144-124153, 124368-124397, + 124400-124410, 124608-124638, 124640-124642, 124644-124645, 124647-124653, + 124656-124660, 124670-124671, 124896-124902, 124904-124907, 124909-124910, + 124912-124926, 124928-125124, 125127-125135, 125184-125251, 125259, + 125264-125273, 126065-126123, 126125-126127, 126129-126132, 126209-126253, + 126255-126269, 126464-126467, 126469-126495, 126497-126498, 126500, 126503, + 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, 126537, 126539, + 126541-126543, 126545-126546, 126548, 126551, 126553, 126555, 126557, 126559, + 126561-126562, 126564, 126567-126570, 126572-126578, 126580-126583, + 126585-126588, 126590, 126592-126601, 126603-126619, 126625-126627, + 126629-126633, 126635-126651, 127232-127244, 130032-130041, 131072-173791, + 173824-178205, 178208-183981, 183984-191456, 191472-192093, 194560-195101, + 196608-201546, + 201552-210041 + |}] +;; + +let%expect_test "notnl" = + Format.printf "%a@." Cset.pp Cset.notnl; + [%expect {| + 0-9, 14-132, 134-8231, 8234-55295, + 57344-1114111 + |}] +;; + +let%expect_test "nl" = + Format.printf "%a@." Cset.pp Cset.nl; + [%expect {| + 10-13, 133, + 8232-8233 + |}] +;; + +let%expect_test "blank" = + Format.printf "%a@." Cset.pp Cset.blank; + [%expect {| + 9, + 32 + |}] +;; + +let%expect_test "space" = + Format.printf "%a@." Cset.pp Cset.space; + [%expect {| + 9-13, 32, 133, 160, 5760, 8192-8202, 8232-8233, 8239, 8287, + 12288 + |}] +;; + +let%expect_test "xdigit" = + Format.printf "%a@." 
Cset.pp Cset.xdigit; + [%expect {| + 48-57, 65-70, 97-102, 65296-65305, 65313-65318, + 65345-65350 + |}] +;; + +let%expect_test "lower" = + Format.printf "%a@." Cset.pp Cset.lower; + [%expect {| + 97-122, 170, 181, 186, 223-246, 248-255, 257, 259, 261, 263, 265, 267, 269, + 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, + 301, 303, 305, 307, 309, 311-312, 314, 316, 318, 320, 322, 324, 326, 328-329, + 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, 353, 355, 357, 359, + 361, 363, 365, 367, 369, 371, 373, 375, 378, 380, 382-384, 387, 389, 392, + 396-397, 402, 405, 409-411, 414, 417, 419, 421, 424, 426-427, 429, 432, 436, + 438, 441-442, 445-447, 454, 457, 460, 462, 464, 466, 468, 470, 472, 474, + 476-477, 479, 481, 483, 485, 487, 489, 491, 493, 495-496, 499, 501, 505, 507, + 509, 511, 513, 515, 517, 519, 521, 523, 525, 527, 529, 531, 533, 535, 537, + 539, 541, 543, 545, 547, 549, 551, 553, 555, 557, 559, 561, 563-569, 572, + 575-576, 578, 583, 585, 587, 589, 591-659, 662-696, 704-705, 736-740, 837, + 881, 883, 887, 890-893, 912, 940-974, 976-977, 981-983, 985, 987, 989, 991, + 993, 995, 997, 999, 1001, 1003, 1005, 1007-1011, 1013, 1016, 1019-1020, + 1072-1119, 1121, 1123, 1125, 1127, 1129, 1131, 1133, 1135, 1137, 1139, 1141, + 1143, 1145, 1147, 1149, 1151, 1153, 1163, 1165, 1167, 1169, 1171, 1173, 1175, + 1177, 1179, 1181, 1183, 1185, 1187, 1189, 1191, 1193, 1195, 1197, 1199, 1201, + 1203, 1205, 1207, 1209, 1211, 1213, 1215, 1218, 1220, 1222, 1224, 1226, 1228, + 1230-1231, 1233, 1235, 1237, 1239, 1241, 1243, 1245, 1247, 1249, 1251, 1253, + 1255, 1257, 1259, 1261, 1263, 1265, 1267, 1269, 1271, 1273, 1275, 1277, 1279, + 1281, 1283, 1285, 1287, 1289, 1291, 1293, 1295, 1297, 1299, 1301, 1303, 1305, + 1307, 1309, 1311, 1313, 1315, 1317, 1319, 1321, 1323, 1325, 1327, 1376-1416, + 4304-4346, 4348-4351, 5112-5117, 7296-7304, 7306, 7424-7615, 7681, 7683, + 7685, 7687, 7689, 7691, 7693, 7695, 7697, 7699, 7701, 7703, 7705, 7707, 7709, + 7711, 
7713, 7715, 7717, 7719, 7721, 7723, 7725, 7727, 7729, 7731, 7733, 7735, + 7737, 7739, 7741, 7743, 7745, 7747, 7749, 7751, 7753, 7755, 7757, 7759, 7761, + 7763, 7765, 7767, 7769, 7771, 7773, 7775, 7777, 7779, 7781, 7783, 7785, 7787, + 7789, 7791, 7793, 7795, 7797, 7799, 7801, 7803, 7805, 7807, 7809, 7811, 7813, + 7815, 7817, 7819, 7821, 7823, 7825, 7827, 7829-7837, 7839, 7841, 7843, 7845, + 7847, 7849, 7851, 7853, 7855, 7857, 7859, 7861, 7863, 7865, 7867, 7869, 7871, + 7873, 7875, 7877, 7879, 7881, 7883, 7885, 7887, 7889, 7891, 7893, 7895, 7897, + 7899, 7901, 7903, 7905, 7907, 7909, 7911, 7913, 7915, 7917, 7919, 7921, 7923, + 7925, 7927, 7929, 7931, 7933, 7935-7943, 7952-7957, 7968-7975, 7984-7991, + 8000-8005, 8016-8023, 8032-8039, 8048-8061, 8064-8071, 8080-8087, 8096-8103, + 8112-8116, 8118-8119, 8126, 8130-8132, 8134-8135, 8144-8147, 8150-8151, + 8160-8167, 8178-8180, 8182-8183, 8305, 8319, 8336-8348, 8458, 8462-8463, + 8467, 8495, 8500, 8505, 8508-8509, 8518-8521, 8526, 8560-8575, 8580, + 9424-9449, 11312-11359, 11361, 11365-11366, 11368, 11370, 11372, 11377, + 11379-11380, 11382-11389, 11393, 11395, 11397, 11399, 11401, 11403, 11405, + 11407, 11409, 11411, 11413, 11415, 11417, 11419, 11421, 11423, 11425, 11427, + 11429, 11431, 11433, 11435, 11437, 11439, 11441, 11443, 11445, 11447, 11449, + 11451, 11453, 11455, 11457, 11459, 11461, 11463, 11465, 11467, 11469, 11471, + 11473, 11475, 11477, 11479, 11481, 11483, 11485, 11487, 11489, 11491-11492, + 11500, 11502, 11507, 11520-11557, 11559, 11565, 42561, 42563, 42565, 42567, + 42569, 42571, 42573, 42575, 42577, 42579, 42581, 42583, 42585, 42587, 42589, + 42591, 42593, 42595, 42597, 42599, 42601, 42603, 42605, 42625, 42627, 42629, + 42631, 42633, 42635, 42637, 42639, 42641, 42643, 42645, 42647, 42649, + 42651-42653, 42787, 42789, 42791, 42793, 42795, 42797, 42799-42801, 42803, + 42805, 42807, 42809, 42811, 42813, 42815, 42817, 42819, 42821, 42823, 42825, + 42827, 42829, 42831, 42833, 42835, 42837, 42839, 42841, 
42843, 42845, 42847, + 42849, 42851, 42853, 42855, 42857, 42859, 42861, 42863-42872, 42874, 42876, + 42879, 42881, 42883, 42885, 42887, 42892, 42894, 42897, 42899-42901, 42903, + 42905, 42907, 42909, 42911, 42913, 42915, 42917, 42919, 42921, 42927, 42933, + 42935, 42937, 42939, 42941, 42943, 42945, 42947, 42952, 42954, 42957, 42959, + 42961, 42963, 42965, 42967, 42969, 42971, 42993-42996, 42998, 43000-43002, + 43824-43866, 43868-43881, 43888-43967, 64256-64262, 64275-64279, 65345-65370, + 66600-66639, 66776-66811, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67456, 67459-67461, 67463-67504, 67506-67514, 68800-68850, 68976-68997, + 71872-71903, 93792-93823, 93883-93907, 119834-119859, 119886-119892, + 119894-119911, 119938-119963, 119990-119993, 119995, 119997-120003, + 120005-120015, 120042-120067, 120094-120119, 120146-120171, 120198-120223, + 120250-120275, 120302-120327, 120354-120379, 120406-120431, 120458-120485, + 120514-120538, 120540-120545, 120572-120596, 120598-120603, 120630-120654, + 120656-120661, 120688-120712, 120714-120719, 120746-120770, 120772-120777, + 120779, 122624-122633, 122635-122654, 122661-122666, 122928-122989, + 125218-125251 + |}] +;; + +let%expect_test "upper" = + Format.printf "%a@." 
Cset.pp Cset.upper; + [%expect {| + 65-90, 192-214, 216-222, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, + 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, + 306, 308, 310, 313, 315, 317, 319, 321, 323, 325, 327, 330, 332, 334, 336, + 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, + 368, 370, 372, 374, 376-377, 379, 381, 385-386, 388, 390-391, 393-395, + 398-401, 403-404, 406-408, 412-413, 415-416, 418, 420, 422-423, 425, 428, + 430-431, 433-435, 437, 439-440, 444, 452, 455, 458, 461, 463, 465, 467, 469, + 471, 473, 475, 478, 480, 482, 484, 486, 488, 490, 492, 494, 497, 500, + 502-504, 506, 508, 510, 512, 514, 516, 518, 520, 522, 524, 526, 528, 530, + 532, 534, 536, 538, 540, 542, 544, 546, 548, 550, 552, 554, 556, 558, 560, + 562, 570-571, 573-574, 577, 579-582, 584, 586, 588, 590, 880, 882, 886, 895, + 902, 904-906, 908, 910-911, 913-929, 931-939, 975, 978-980, 984, 986, 988, + 990, 992, 994, 996, 998, 1000, 1002, 1004, 1006, 1012, 1015, 1017-1018, + 1021-1071, 1120, 1122, 1124, 1126, 1128, 1130, 1132, 1134, 1136, 1138, 1140, + 1142, 1144, 1146, 1148, 1150, 1152, 1162, 1164, 1166, 1168, 1170, 1172, 1174, + 1176, 1178, 1180, 1182, 1184, 1186, 1188, 1190, 1192, 1194, 1196, 1198, 1200, + 1202, 1204, 1206, 1208, 1210, 1212, 1214, 1216-1217, 1219, 1221, 1223, 1225, + 1227, 1229, 1232, 1234, 1236, 1238, 1240, 1242, 1244, 1246, 1248, 1250, 1252, + 1254, 1256, 1258, 1260, 1262, 1264, 1266, 1268, 1270, 1272, 1274, 1276, 1278, + 1280, 1282, 1284, 1286, 1288, 1290, 1292, 1294, 1296, 1298, 1300, 1302, 1304, + 1306, 1308, 1310, 1312, 1314, 1316, 1318, 1320, 1322, 1324, 1326, 1329-1366, + 4256-4293, 4295, 4301, 5024-5109, 7305, 7312-7354, 7357-7359, 7680, 7682, + 7684, 7686, 7688, 7690, 7692, 7694, 7696, 7698, 7700, 7702, 7704, 7706, 7708, + 7710, 7712, 7714, 7716, 7718, 7720, 7722, 7724, 7726, 7728, 7730, 7732, 7734, + 7736, 7738, 7740, 7742, 7744, 7746, 7748, 7750, 7752, 7754, 7756, 7758, 7760, + 7762, 7764, 7766, 
7768, 7770, 7772, 7774, 7776, 7778, 7780, 7782, 7784, 7786, + 7788, 7790, 7792, 7794, 7796, 7798, 7800, 7802, 7804, 7806, 7808, 7810, 7812, + 7814, 7816, 7818, 7820, 7822, 7824, 7826, 7828, 7838, 7840, 7842, 7844, 7846, + 7848, 7850, 7852, 7854, 7856, 7858, 7860, 7862, 7864, 7866, 7868, 7870, 7872, + 7874, 7876, 7878, 7880, 7882, 7884, 7886, 7888, 7890, 7892, 7894, 7896, 7898, + 7900, 7902, 7904, 7906, 7908, 7910, 7912, 7914, 7916, 7918, 7920, 7922, 7924, + 7926, 7928, 7930, 7932, 7934, 7944-7951, 7960-7965, 7976-7983, 7992-7999, + 8008-8013, 8025, 8027, 8029, 8031, 8040-8047, 8120-8123, 8136-8139, + 8152-8155, 8168-8172, 8184-8187, 8450, 8455, 8459-8461, 8464-8466, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8496-8499, 8510-8511, 8517, + 8544-8559, 8579, 9398-9423, 11264-11311, 11360, 11362-11364, 11367, 11369, + 11371, 11373-11376, 11378, 11381, 11390-11392, 11394, 11396, 11398, 11400, + 11402, 11404, 11406, 11408, 11410, 11412, 11414, 11416, 11418, 11420, 11422, + 11424, 11426, 11428, 11430, 11432, 11434, 11436, 11438, 11440, 11442, 11444, + 11446, 11448, 11450, 11452, 11454, 11456, 11458, 11460, 11462, 11464, 11466, + 11468, 11470, 11472, 11474, 11476, 11478, 11480, 11482, 11484, 11486, 11488, + 11490, 11499, 11501, 11506, 42560, 42562, 42564, 42566, 42568, 42570, 42572, + 42574, 42576, 42578, 42580, 42582, 42584, 42586, 42588, 42590, 42592, 42594, + 42596, 42598, 42600, 42602, 42604, 42624, 42626, 42628, 42630, 42632, 42634, + 42636, 42638, 42640, 42642, 42644, 42646, 42648, 42650, 42786, 42788, 42790, + 42792, 42794, 42796, 42798, 42802, 42804, 42806, 42808, 42810, 42812, 42814, + 42816, 42818, 42820, 42822, 42824, 42826, 42828, 42830, 42832, 42834, 42836, + 42838, 42840, 42842, 42844, 42846, 42848, 42850, 42852, 42854, 42856, 42858, + 42860, 42862, 42873, 42875, 42877-42878, 42880, 42882, 42884, 42886, 42891, + 42893, 42896, 42898, 42902, 42904, 42906, 42908, 42910, 42912, 42914, 42916, + 42918, 42920, 42922-42926, 42928-42932, 42934, 42936, 42938, 
42940, 42942, + 42944, 42946, 42948-42951, 42953, 42955-42956, 42958, 42960, 42962, 42964, + 42966, 42968, 42970, 42972, 42997, 65313-65338, 66560-66599, 66736-66771, + 66928-66938, 66940-66954, 66956-66962, 66964-66965, 68736-68786, 68944-68965, + 71840-71871, 93760-93791, 93856-93880, 119808-119833, 119860-119885, + 119912-119937, 119964, 119966-119967, 119970, 119973-119974, 119977-119980, + 119982-119989, 120016-120041, 120068-120069, 120071-120074, 120077-120084, + 120086-120092, 120120-120121, 120123-120126, 120128-120132, 120134, + 120138-120144, 120172-120197, 120224-120249, 120276-120301, 120328-120353, + 120380-120405, 120432-120457, 120488-120512, 120546-120570, 120604-120628, + 120662-120686, 120720-120744, 120778, 125184-125217, 127280-127305, + 127312-127337, + 127344-127369 + |}] +;; + +let%expect_test "alpha" = + Format.printf "%a@." Cset.pp Cset.alpha; + [%expect {| + 65-90, 97-122, 170, 181, 186, 192-214, 216-246, 248-705, 710-721, 736-740, + 748, 750, 837, 867-884, 886-887, 890-893, 895, 902, 904-906, 908, 910-929, + 931-1013, 1015-1153, 1162-1327, 1329-1366, 1369, 1376-1416, 1456-1469, 1471, + 1473-1474, 1476-1477, 1479, 1488-1514, 1519-1522, 1552-1562, 1568-1623, + 1625-1631, 1646-1747, 1749-1756, 1761-1768, 1773-1775, 1786-1788, 1791, + 1808-1855, 1869-1969, 1994-2026, 2036-2037, 2042, 2048-2071, 2074-2092, + 2112-2136, 2144-2154, 2160-2183, 2185-2191, 2199, 2208-2249, 2260-2271, + 2275-2281, 2288-2363, 2365-2380, 2382-2384, 2389-2403, 2417-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, 2503-2504, + 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2544-2545, 2556, 2561-2563, + 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, 2613-2614, 2616-2617, + 2622-2626, 2631-2632, 2635-2636, 2641, 2649-2652, 2654, 2672-2677, 2689-2691, + 2693-2701, 2703-2705, 2707-2728, 2730-2736, 2738-2739, 2741-2745, 2749-2757, + 2759-2761, 2763-2764, 2768, 2784-2787, 2809-2812, 2817-2819, 2821-2828, + 2831-2832, 2835-2856, 2858-2864, 
2866-2867, 2869-2873, 2877-2884, 2887-2888, + 2891-2892, 2902-2903, 2908-2909, 2911-2915, 2929, 2946-2947, 2949-2954, + 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, 2984-2986, + 2990-3001, 3006-3010, 3014-3016, 3018-3020, 3024, 3031, 3072-3084, 3086-3088, + 3090-3112, 3114-3129, 3133-3140, 3142-3144, 3146-3148, 3157-3158, 3160-3162, + 3164-3165, 3168-3171, 3200-3203, 3205-3212, 3214-3216, 3218-3240, 3242-3251, + 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, 3292-3294, 3296-3299, + 3313-3315, 3328-3340, 3342-3344, 3346-3386, 3389-3396, 3398-3400, 3402-3404, + 3406, 3412-3415, 3423-3427, 3450-3455, 3457-3459, 3461-3478, 3482-3505, + 3507-3515, 3517, 3520-3526, 3535-3540, 3542, 3544-3551, 3570-3571, 3585-3642, + 3648-3654, 3661, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, 3751-3769, + 3771-3773, 3776-3780, 3782, 3789, 3804-3807, 3840, 3904-3911, 3913-3948, + 3953-3971, 3976-3991, 3993-4028, 4096-4150, 4152, 4155-4159, 4176-4239, + 4250-4253, 4256-4293, 4295, 4301, 4304-4346, 4348-4680, 4682-4685, 4688-4694, + 4696, 4698-4701, 4704-4744, 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, + 4802-4805, 4808-4822, 4824-4880, 4882-4885, 4888-4954, 4992-5007, 5024-5109, + 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, 5888-5907, + 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, 6016-6067, 6070-6088, + 6103, 6108, 6176-6264, 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6456, + 6480-6509, 6512-6516, 6528-6571, 6576-6601, 6656-6683, 6688-6750, 6753-6772, + 6823, 6847-6848, 6860-6862, 6912-6963, 6965-6979, 6981-6988, 7040-7081, + 7084-7087, 7098-7141, 7143-7153, 7168-7222, 7245-7247, 7258-7293, 7296-7306, + 7312-7354, 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, + 7635-7668, 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, 8144-8147, + 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8305, 8319, 8336-8348, 8450, + 8455, 
8458-8467, 8469, 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, + 8508-8511, 8517-8521, 8526, 8544-8584, 9398-9449, 11264-11492, 11499-11502, + 11506-11507, 11520-11557, 11559, 11565, 11568-11623, 11631, 11648-11670, + 11680-11686, 11688-11694, 11696-11702, 11704-11710, 11712-11718, 11720-11726, + 11728-11734, 11736-11742, 11744-11775, 11823, 12293-12295, 12321-12329, + 12337-12341, 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12704-12735, 12784-12799, 13312-19903, 19968-42124, + 42192-42237, 42240-42508, 42512-42527, 42538-42539, 42560-42606, 42612-42619, + 42623-42735, 42775-42783, 42786-42888, 42891-42972, 42993-43013, 43015-43047, + 43072-43123, 43136-43203, 43205, 43250-43255, 43259, 43261-43263, + 43274-43306, 43312-43346, 43360-43388, 43392-43442, 43444-43455, 43471, + 43488-43503, 43514-43518, 43520-43574, 43584-43597, 43616-43638, 43642-43710, + 43712, 43714, 43739-43741, 43744-43759, 43762-43765, 43777-43782, + 43785-43790, 43793-43798, 43808-43814, 43816-43822, 43824-43866, 43868-43881, + 43888-44010, 44032-55203, 55216-55238, 55243-55291, 63744-64109, 64112-64217, + 64256-64262, 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, + 64320-64321, 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65313-65338, 65345-65370, 65382-65470, + 65474-65479, 65482-65487, 65490-65495, 65498-65500, 65536-65547, 65549-65574, + 65576-65594, 65596-65597, 65599-65613, 65616-65629, 65664-65786, 65856-65908, + 66176-66204, 66208-66256, 66304-66335, 66349-66378, 66384-66426, 66432-66461, + 66464-66499, 66504-66511, 66513-66517, 66560-66717, 66736-66771, 66776-66811, + 66816-66855, 66864-66915, 66928-66938, 66940-66954, 66956-66962, 66964-66965, + 66967-66977, 66979-66993, 66995-67001, 67003-67004, 67008-67059, 67072-67382, + 67392-67413, 67424-67431, 67456-67461, 67463-67504, 67506-67514, 67584-67589, + 67592, 67594-67637, 67639-67640, 67644, 67647-67669, 
67680-67702, + 67712-67742, 67808-67826, 67828-67829, 67840-67861, 67872-67897, 67904-67929, + 67968-68023, 68030-68031, 68096-68099, 68101-68102, 68108-68115, 68117-68119, + 68121-68149, 68192-68220, 68224-68252, 68288-68295, 68297-68324, 68352-68405, + 68416-68437, 68448-68466, 68480-68497, 68608-68680, 68736-68786, 68800-68850, + 68864-68903, 68938-68965, 68969, 68975-68997, 69248-69289, 69291-69292, + 69296-69297, 69314-69319, 69370-69372, 69376-69404, 69415, 69424-69445, + 69488-69505, 69552-69572, 69600-69622, 69632-69701, 69745-69749, 69760-69816, + 69826, 69840-69864, 69888-69938, 69956-69959, 69968-70002, 70006, + 70016-70079, 70081-70084, 70094-70095, 70106, 70108, 70144-70161, + 70163-70196, 70199, 70206-70209, 70272-70278, 70280, 70282-70285, + 70287-70301, 70303-70312, 70320-70376, 70400-70403, 70405-70412, 70415-70416, + 70419-70440, 70442-70448, 70450-70451, 70453-70457, 70461-70468, 70471-70472, + 70475-70476, 70480, 70487, 70493-70499, 70528-70537, 70539, 70542, + 70544-70581, 70583-70592, 70594, 70597, 70599-70602, 70604-70605, 70609, + 70611, 70656-70721, 70723-70725, 70727-70730, 70751-70753, 70784-70849, + 70852-70853, 70855, 71040-71093, 71096-71102, 71128-71133, 71168-71230, + 71232, 71236, 71296-71349, 71352, 71424-71450, 71453-71466, 71488-71494, + 71680-71736, 71840-71903, 71935-71942, 71945, 71948-71955, 71957-71958, + 71960-71989, 71991-71992, 71995-71996, 71999-72002, 72096-72103, 72106-72151, + 72154-72159, 72161, 72163-72164, 72192-72242, 72245-72254, 72272-72343, + 72349, 72368-72440, 72544-72551, 72640-72672, 72704-72712, 72714-72758, + 72760-72766, 72768, 72818-72847, 72850-72871, 72873-72886, 72960-72966, + 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, 73027, + 73030-73031, 73056-73061, 73063-73064, 73066-73102, 73104-73105, 73107-73110, + 73112, 73136-73179, 73440-73462, 73472-73488, 73490-73530, 73534-73536, + 73648, 73728-74649, 74752-74862, 74880-75075, 77712-77808, 77824-78895, + 78913-78918, 78944-82938, 
82944-83526, 90368-90414, 92160-92728, 92736-92766, + 92784-92862, 92880-92909, 92928-92975, 92992-92995, 93027-93047, 93053-93071, + 93504-93548, 93760-93823, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, 101631-101662, + 101760-101874, 110576-110579, 110581-110587, 110589-110590, 110592-110882, + 110898, 110928-110930, 110933, 110948-110951, 110960-111355, 113664-113770, + 113776-113788, 113792-113800, 113808-113817, 113822, 119808-119892, + 119894-119964, 119966-119967, 119970, 119973-119974, 119977-119980, + 119982-119993, 119995, 119997-120003, 120005-120069, 120071-120074, + 120077-120084, 120086-120092, 120094-120121, 120123-120126, 120128-120132, + 120134, 120138-120144, 120146-120485, 120488-120512, 120514-120538, + 120540-120570, 120572-120596, 120598-120628, 120630-120654, 120656-120686, + 120688-120712, 120714-120744, 120746-120770, 120772-120779, 122624-122654, + 122661-122666, 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123191-123197, 123214, + 123536-123565, 123584-123627, 124112-124139, 124368-124397, 124400, + 124608-124638, 124640-124661, 124670-124671, 124896-124902, 124904-124907, + 124909-124910, 124912-124926, 124928-125124, 125184-125251, 125255, 125259, + 126464-126467, 126469-126495, 126497-126498, 126500, 126503, 126505-126514, + 126516-126519, 126521, 126523, 126530, 126535, 126537, 126539, 126541-126543, + 126545-126546, 126548, 126551, 126553, 126555, 126557, 126559, 126561-126562, + 126564, 126567-126570, 126572-126578, 126580-126583, 126585-126588, 126590, + 126592-126601, 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127280-127305, 127312-127337, 127344-127369, 131072-173791, 173824-178205, + 178208-183981, 183984-191456, 191472-192093, 194560-195101, 196608-201546, + 201552-210041 + |}] +;; + +let%expect_test "alnum" = + Format.printf "%a@." 
Cset.pp Cset.alnum; + [%expect {| + 48-57, 65-90, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, 216-246, + 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, 890-893, 895, + 902, 904-906, 908, 910-929, 931-1013, 1015-1153, 1162-1327, 1329-1366, 1369, + 1376-1416, 1456-1469, 1471, 1473-1474, 1476-1477, 1479, 1488-1514, 1519-1522, + 1552-1562, 1568-1623, 1625-1641, 1646-1747, 1749-1756, 1761-1768, 1773-1788, + 1791, 1808-1855, 1869-1969, 1984-2026, 2036-2037, 2042, 2048-2071, 2074-2092, + 2112-2136, 2144-2154, 2160-2183, 2185-2191, 2199, 2208-2249, 2260-2271, + 2275-2281, 2288-2363, 2365-2380, 2382-2384, 2389-2403, 2406-2415, 2417-2435, + 2437-2444, 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, + 2503-2504, 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, 2548-2553, + 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, + 2613-2614, 2616-2617, 2622-2626, 2631-2632, 2635-2636, 2641, 2649-2652, 2654, + 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, 2730-2736, 2738-2739, + 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, 2784-2787, 2790-2799, + 2809-2812, 2817-2819, 2821-2828, 2831-2832, 2835-2856, 2858-2864, 2866-2867, + 2869-2873, 2877-2884, 2887-2888, 2891-2892, 2902-2903, 2908-2909, 2911-2915, + 2918-2927, 2929-2935, 2946-2947, 2949-2954, 2958-2960, 2962-2965, 2969-2970, + 2972, 2974-2975, 2979-2980, 2984-2986, 2990-3001, 3006-3010, 3014-3016, + 3018-3020, 3024, 3031, 3046-3058, 3072-3084, 3086-3088, 3090-3112, 3114-3129, + 3133-3140, 3142-3144, 3146-3148, 3157-3158, 3160-3162, 3164-3165, 3168-3171, + 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, 3218-3240, 3242-3251, + 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, 3292-3294, 3296-3299, + 3302-3311, 3313-3315, 3328-3340, 3342-3344, 3346-3386, 3389-3396, 3398-3400, + 3402-3404, 3406, 3412-3427, 3430-3448, 3450-3455, 3457-3459, 3461-3478, + 3482-3505, 3507-3515, 3517, 3520-3526, 3535-3540, 3542, 3544-3551, 3558-3567, + 3570-3571, 
3585-3642, 3648-3654, 3661, 3664-3673, 3713-3714, 3716, 3718-3722, + 3724-3747, 3749, 3751-3769, 3771-3773, 3776-3780, 3782, 3789, 3792-3801, + 3804-3807, 3840, 3872-3891, 3904-3911, 3913-3948, 3953-3971, 3976-3991, + 3993-4028, 4096-4150, 4152, 4155-4169, 4176-4253, 4256-4293, 4295, 4301, + 4304-4346, 4348-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, + 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, + 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, 5024-5109, 5112-5117, + 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, 5888-5907, 5919-5939, + 5952-5971, 5984-5996, 5998-6000, 6002-6003, 6016-6067, 6070-6088, 6103, 6108, + 6112-6121, 6128-6137, 6160-6169, 6176-6264, 6272-6314, 6320-6389, 6400-6430, + 6432-6443, 6448-6456, 6470-6509, 6512-6516, 6528-6571, 6576-6601, 6608-6618, + 6656-6683, 6688-6750, 6753-6772, 6784-6793, 6800-6809, 6823, 6847-6848, + 6860-6862, 6912-6963, 6965-6979, 6981-6988, 6992-7001, 7040-7081, 7084-7141, + 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, 7312-7354, 7357-7359, + 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, 7635-7668, 7680-7957, + 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, 8029, 8031-8061, + 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, 8144-8147, 8150-8155, + 8160-8172, 8178-8180, 8182-8188, 8304-8305, 8308-8313, 8319-8329, 8336-8348, + 8450, 8455, 8458-8467, 8469, 8473-8477, 8484, 8486, 8488, 8490-8493, + 8495-8505, 8508-8511, 8517-8521, 8526, 8528-8585, 9312-9371, 9398-9471, + 10102-10131, 11264-11492, 11499-11502, 11506-11507, 11517, 11520-11557, + 11559, 11565, 11568-11623, 11631, 11648-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, 11736-11742, + 11744-11775, 11823, 12293-12295, 12321-12329, 12337-12341, 12344-12348, + 12353-12438, 12445-12447, 12449-12538, 12540-12543, 12549-12591, 12593-12686, + 12690-12693, 12704-12735, 12784-12799, 12832-12841, 12872-12879, 12881-12895, + 12928-12937, 
12977-12991, 13312-19903, 19968-42124, 42192-42237, 42240-42508, + 42512-42539, 42560-42606, 42612-42619, 42623-42735, 42775-42783, 42786-42888, + 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, 43136-43203, + 43205, 43216-43225, 43250-43255, 43259, 43261-43306, 43312-43346, + 43360-43388, 43392-43442, 43444-43455, 43471-43481, 43488-43518, 43520-43574, + 43584-43597, 43600-43609, 43616-43638, 43642-43710, 43712, 43714, + 43739-43741, 43744-43759, 43762-43765, 43777-43782, 43785-43790, 43793-43798, + 43808-43814, 43816-43822, 43824-43866, 43868-43881, 43888-44010, 44016-44025, + 44032-55203, 55216-55238, 55243-55291, 63744-64109, 64112-64217, 64256-64262, + 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, 64320-64321, + 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, 65008-65019, + 65136-65140, 65142-65276, 65296-65305, 65313-65338, 65345-65370, 65382-65470, + 65474-65479, 65482-65487, 65490-65495, 65498-65500, 65536-65547, 65549-65574, + 65576-65594, 65596-65597, 65599-65613, 65616-65629, 65664-65786, 65799-65843, + 65856-65912, 65930-65931, 66176-66204, 66208-66256, 66273-66299, 66304-66339, + 66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, 66513-66517, + 66560-66717, 66720-66729, 66736-66771, 66776-66811, 66816-66855, 66864-66915, + 66928-66938, 66940-66954, 66956-66962, 66964-66965, 66967-66977, 66979-66993, + 66995-67001, 67003-67004, 67008-67059, 67072-67382, 67392-67413, 67424-67431, + 67456-67461, 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, + 67639-67640, 67644, 67647-67669, 67672-67702, 67705-67742, 67751-67759, + 67808-67826, 67828-67829, 67835-67867, 67872-67897, 67904-67929, 67968-68023, + 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, 68121-68149, + 68160-68168, 68192-68222, 68224-68255, 68288-68295, 68297-68324, 68331-68335, + 68352-68405, 68416-68437, 68440-68466, 68472-68497, 68521-68527, 68608-68680, + 68736-68786, 68800-68850, 68858-68903, 68912-68921, 
68928-68965, 68969, + 68975-68997, 69216-69246, 69248-69289, 69291-69292, 69296-69297, 69314-69319, + 69370-69372, 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, 69826, + 69840-69864, 69872-69881, 69888-69938, 69942-69951, 69956-69959, 69968-70002, + 70006, 70016-70079, 70081-70084, 70094-70106, 70108, 70113-70132, + 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, 70280, + 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, 70400-70403, + 70405-70412, 70415-70416, 70419-70440, 70442-70448, 70450-70451, 70453-70457, + 70461-70468, 70471-70472, 70475-70476, 70480, 70487, 70493-70499, + 70528-70537, 70539, 70542, 70544-70581, 70583-70592, 70594, 70597, + 70599-70602, 70604-70605, 70609, 70611, 70656-70721, 70723-70725, + 70727-70730, 70736-70745, 70751-70753, 70784-70849, 70852-70853, 70855, + 70864-70873, 71040-71093, 71096-71102, 71128-71133, 71168-71230, 71232, + 71236, 71248-71257, 71296-71349, 71352, 71360-71369, 71376-71395, + 71424-71450, 71453-71466, 71472-71483, 71488-71494, 71680-71736, 71840-71922, + 71935-71942, 71945, 71948-71955, 71957-71958, 71960-71989, 71991-71992, + 71995-71996, 71999-72002, 72016-72025, 72096-72103, 72106-72151, 72154-72159, + 72161, 72163-72164, 72192-72242, 72245-72254, 72272-72343, 72349, + 72368-72440, 72544-72551, 72640-72672, 72688-72697, 72704-72712, 72714-72758, + 72760-72766, 72768, 72784-72812, 72818-72847, 72850-72871, 72873-72886, + 72960-72966, 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, + 73027, 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, 77824-78895, + 78913-78918, 78944-82938, 82944-83526, 90368-90414, 90416-90425, 92160-92728, + 92736-92766, 92768-92777, 
92784-92862, 92864-92873, 92880-92909, 92928-92975, + 92992-92995, 93008-93017, 93019-93025, 93027-93047, 93053-93071, 93504-93548, + 93552-93561, 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, 101631-101662, + 101760-101874, 110576-110579, 110581-110587, 110589-110590, 110592-110882, + 110898, 110928-110930, 110933, 110948-110951, 110960-111355, 113664-113770, + 113776-113788, 113792-113800, 113808-113817, 113822, 118000-118009, + 119488-119507, 119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, 119995, + 119997-120003, 120005-120069, 120071-120074, 120077-120084, 120086-120092, + 120094-120121, 120123-120126, 120128-120132, 120134, 120138-120144, + 120146-120485, 120488-120512, 120514-120538, 120540-120570, 120572-120596, + 120598-120628, 120630-120654, 120656-120686, 120688-120712, 120714-120744, + 120746-120770, 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, 122918-122922, + 122928-122989, 123023, 123136-123180, 123191-123197, 123200-123209, 123214, + 123536-123565, 123584-123627, 123632-123641, 124112-124139, 124144-124153, + 124368-124397, 124400-124410, 124608-124638, 124640-124661, 124670-124671, + 124896-124902, 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, 126065-126123, + 126125-126127, 126129-126132, 126209-126253, 126255-126269, 126464-126467, + 126469-126495, 126497-126498, 126500, 126503, 126505-126514, 126516-126519, + 126521, 126523, 126530, 126535, 126537, 126539, 126541-126543, 126545-126546, + 126548, 126551, 126553, 126555, 126557, 126559, 126561-126562, 126564, + 126567-126570, 126572-126578, 126580-126583, 126585-126588, 126590, + 126592-126601, 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 
127312-127337, 127344-127369, 130032-130041, + 131072-173791, 173824-178205, 178208-183981, 183984-191456, 191472-192093, + 194560-195101, 196608-201546, + 201552-210041 + |}] +;; + +let%expect_test "wordc" = + Format.printf "%a@." Cset.pp Cset.wordc; + [%expect {| + 48-57, 65-90, 95, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, 890-893, + 895, 902, 904-906, 908, 910-929, 931-1013, 1015-1153, 1162-1327, 1329-1366, + 1369, 1376-1416, 1456-1469, 1471, 1473-1474, 1476-1477, 1479, 1488-1514, + 1519-1522, 1552-1562, 1568-1623, 1625-1641, 1646-1747, 1749-1756, 1761-1768, + 1773-1788, 1791, 1808-1855, 1869-1969, 1984-2026, 2036-2037, 2042, 2048-2071, + 2074-2092, 2112-2136, 2144-2154, 2160-2183, 2185-2191, 2199, 2208-2249, + 2260-2271, 2275-2281, 2288-2363, 2365-2380, 2382-2384, 2389-2403, 2406-2415, + 2417-2435, 2437-2444, 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, + 2493-2500, 2503-2504, 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, + 2548-2553, 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, + 2610-2611, 2613-2614, 2616-2617, 2622-2626, 2631-2632, 2635-2636, 2641, + 2649-2652, 2654, 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, + 2730-2736, 2738-2739, 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, + 2784-2787, 2790-2799, 2809-2812, 2817-2819, 2821-2828, 2831-2832, 2835-2856, + 2858-2864, 2866-2867, 2869-2873, 2877-2884, 2887-2888, 2891-2892, 2902-2903, + 2908-2909, 2911-2915, 2918-2927, 2929-2935, 2946-2947, 2949-2954, 2958-2960, + 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, 2984-2986, 2990-3001, + 3006-3010, 3014-3016, 3018-3020, 3024, 3031, 3046-3058, 3072-3084, 3086-3088, + 3090-3112, 3114-3129, 3133-3140, 3142-3144, 3146-3148, 3157-3158, 3160-3162, + 3164-3165, 3168-3171, 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, + 3218-3240, 3242-3251, 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 
3313-3315, 3328-3340, 3342-3344, 3346-3386, + 3389-3396, 3398-3400, 3402-3404, 3406, 3412-3427, 3430-3448, 3450-3455, + 3457-3459, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, 3535-3540, 3542, + 3544-3551, 3558-3567, 3570-3571, 3585-3642, 3648-3654, 3661, 3664-3673, + 3713-3714, 3716, 3718-3722, 3724-3747, 3749, 3751-3769, 3771-3773, 3776-3780, + 3782, 3789, 3792-3801, 3804-3807, 3840, 3872-3891, 3904-3911, 3913-3948, + 3953-3971, 3976-3991, 3993-4028, 4096-4150, 4152, 4155-4169, 4176-4253, + 4256-4293, 4295, 4301, 4304-4346, 4348-4680, 4682-4685, 4688-4694, 4696, + 4698-4701, 4704-4744, 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, + 4802-4805, 4808-4822, 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, + 5024-5109, 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, + 5888-5907, 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, 6016-6067, + 6070-6088, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, 6176-6264, 6272-6314, + 6320-6389, 6400-6430, 6432-6443, 6448-6456, 6470-6509, 6512-6516, 6528-6571, + 6576-6601, 6608-6618, 6656-6683, 6688-6750, 6753-6772, 6784-6793, 6800-6809, + 6823, 6847-6848, 6860-6862, 6912-6963, 6965-6979, 6981-6988, 6992-7001, + 7040-7081, 7084-7141, 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, + 7312-7354, 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, + 7635-7668, 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, 8144-8147, + 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8304-8305, 8308-8313, 8319-8329, + 8336-8348, 8450, 8455, 8458-8467, 8469, 8473-8477, 8484, 8486, 8488, + 8490-8493, 8495-8505, 8508-8511, 8517-8521, 8526, 8528-8585, 9312-9371, + 9398-9471, 10102-10131, 11264-11492, 11499-11502, 11506-11507, 11517, + 11520-11557, 11559, 11565, 11568-11623, 11631, 11648-11670, 11680-11686, + 11688-11694, 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, + 11736-11742, 11744-11775, 11823, 
12293-12295, 12321-12329, 12337-12341, + 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, 12549-12591, + 12593-12686, 12690-12693, 12704-12735, 12784-12799, 12832-12841, 12872-12879, + 12881-12895, 12928-12937, 12977-12991, 13312-19903, 19968-42124, 42192-42237, + 42240-42508, 42512-42539, 42560-42606, 42612-42619, 42623-42735, 42775-42783, + 42786-42888, 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, + 43136-43203, 43205, 43216-43225, 43250-43255, 43259, 43261-43306, + 43312-43346, 43360-43388, 43392-43442, 43444-43455, 43471-43481, 43488-43518, + 43520-43574, 43584-43597, 43600-43609, 43616-43638, 43642-43710, 43712, + 43714, 43739-43741, 43744-43759, 43762-43765, 43777-43782, 43785-43790, + 43793-43798, 43808-43814, 43816-43822, 43824-43866, 43868-43881, 43888-44010, + 44016-44025, 44032-55203, 55216-55238, 55243-55291, 63744-64109, 64112-64217, + 64256-64262, 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, + 64320-64321, 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65296-65305, 65313-65338, 65345-65370, + 65382-65470, 65474-65479, 65482-65487, 65490-65495, 65498-65500, 65536-65547, + 65549-65574, 65576-65594, 65596-65597, 65599-65613, 65616-65629, 65664-65786, + 65799-65843, 65856-65912, 65930-65931, 66176-66204, 66208-66256, 66273-66299, + 66304-66339, 66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, + 66513-66517, 66560-66717, 66720-66729, 66736-66771, 66776-66811, 66816-66855, + 66864-66915, 66928-66938, 66940-66954, 66956-66962, 66964-66965, 66967-66977, + 66979-66993, 66995-67001, 67003-67004, 67008-67059, 67072-67382, 67392-67413, + 67424-67431, 67456-67461, 67463-67504, 67506-67514, 67584-67589, 67592, + 67594-67637, 67639-67640, 67644, 67647-67669, 67672-67702, 67705-67742, + 67751-67759, 67808-67826, 67828-67829, 67835-67867, 67872-67897, 67904-67929, + 67968-68023, 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, + 
68121-68149, 68160-68168, 68192-68222, 68224-68255, 68288-68295, 68297-68324, + 68331-68335, 68352-68405, 68416-68437, 68440-68466, 68472-68497, 68521-68527, + 68608-68680, 68736-68786, 68800-68850, 68858-68903, 68912-68921, 68928-68965, + 68969, 68975-68997, 69216-69246, 69248-69289, 69291-69292, 69296-69297, + 69314-69319, 69370-69372, 69376-69415, 69424-69445, 69457-69460, 69488-69505, + 69552-69579, 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, + 69826, 69840-69864, 69872-69881, 69888-69938, 69942-69951, 69956-69959, + 69968-70002, 70006, 70016-70079, 70081-70084, 70094-70106, 70108, + 70113-70132, 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, + 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, 70450-70451, + 70453-70457, 70461-70468, 70471-70472, 70475-70476, 70480, 70487, + 70493-70499, 70528-70537, 70539, 70542, 70544-70581, 70583-70592, 70594, + 70597, 70599-70602, 70604-70605, 70609, 70611, 70656-70721, 70723-70725, + 70727-70730, 70736-70745, 70751-70753, 70784-70849, 70852-70853, 70855, + 70864-70873, 71040-71093, 71096-71102, 71128-71133, 71168-71230, 71232, + 71236, 71248-71257, 71296-71349, 71352, 71360-71369, 71376-71395, + 71424-71450, 71453-71466, 71472-71483, 71488-71494, 71680-71736, 71840-71922, + 71935-71942, 71945, 71948-71955, 71957-71958, 71960-71989, 71991-71992, + 71995-71996, 71999-72002, 72016-72025, 72096-72103, 72106-72151, 72154-72159, + 72161, 72163-72164, 72192-72242, 72245-72254, 72272-72343, 72349, + 72368-72440, 72544-72551, 72640-72672, 72688-72697, 72704-72712, 72714-72758, + 72760-72766, 72768, 72784-72812, 72818-72847, 72850-72871, 72873-72886, + 72960-72966, 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, + 73027, 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 
73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, 77824-78895, + 78913-78918, 78944-82938, 82944-83526, 90368-90414, 90416-90425, 92160-92728, + 92736-92766, 92768-92777, 92784-92862, 92864-92873, 92880-92909, 92928-92975, + 92992-92995, 93008-93017, 93019-93025, 93027-93047, 93053-93071, 93504-93548, + 93552-93561, 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, 101631-101662, + 101760-101874, 110576-110579, 110581-110587, 110589-110590, 110592-110882, + 110898, 110928-110930, 110933, 110948-110951, 110960-111355, 113664-113770, + 113776-113788, 113792-113800, 113808-113817, 113822, 118000-118009, + 119488-119507, 119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, 119995, + 119997-120003, 120005-120069, 120071-120074, 120077-120084, 120086-120092, + 120094-120121, 120123-120126, 120128-120132, 120134, 120138-120144, + 120146-120485, 120488-120512, 120514-120538, 120540-120570, 120572-120596, + 120598-120628, 120630-120654, 120656-120686, 120688-120712, 120714-120744, + 120746-120770, 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, 122918-122922, + 122928-122989, 123023, 123136-123180, 123191-123197, 123200-123209, 123214, + 123536-123565, 123584-123627, 123632-123641, 124112-124139, 124144-124153, + 124368-124397, 124400-124410, 124608-124638, 124640-124661, 124670-124671, + 124896-124902, 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, 126065-126123, + 126125-126127, 126129-126132, 126209-126253, 126255-126269, 126464-126467, + 126469-126495, 126497-126498, 126500, 126503, 126505-126514, 126516-126519, + 126521, 126523, 126530, 126535, 126537, 126539, 126541-126543, 126545-126546, + 126548, 126551, 126553, 
126555, 126557, 126559, 126561-126562, 126564, + 126567-126570, 126572-126578, 126580-126583, 126585-126588, 126590, + 126592-126601, 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 127312-127337, 127344-127369, 130032-130041, + 131072-173791, 173824-178205, 178208-183981, 183984-191456, 191472-192093, + 194560-195101, 196608-201546, + 201552-210041 + |}] +;; + +let%expect_test "cntrl" = + Format.printf "%a@." Cset.pp Cset.cntrl; + [%expect {| + 0-31, + 127-159 + |}] +;; + +let%expect_test "graph" = + Format.printf "%a@." Cset.pp Cset.graph; + [%expect {| + 32-126, 160-172, 174-887, 890-895, 900-906, 908, 910-929, 931-1327, + 1329-1366, 1369-1418, 1421-1423, 1425-1479, 1488-1514, 1519-1524, 1542-1563, + 1565-1756, 1758-1805, 1808-1866, 1869-1969, 1984-2042, 2045-2093, 2096-2110, + 2112-2139, 2142, 2144-2154, 2160-2191, 2199-2273, 2275-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2492-2500, 2503-2504, + 2507-2510, 2519, 2524-2525, 2527-2531, 2534-2558, 2561-2563, 2565-2570, + 2575-2576, 2579-2600, 2602-2608, 2610-2611, 2613-2614, 2616-2617, 2620, + 2622-2626, 2631-2632, 2635-2637, 2641, 2649-2652, 2654, 2662-2678, 2689-2691, + 2693-2701, 2703-2705, 2707-2728, 2730-2736, 2738-2739, 2741-2745, 2748-2757, + 2759-2761, 2763-2765, 2768, 2784-2787, 2790-2801, 2809-2815, 2817-2819, + 2821-2828, 2831-2832, 2835-2856, 2858-2864, 2866-2867, 2869-2873, 2876-2884, + 2887-2888, 2891-2893, 2901-2903, 2908-2909, 2911-2915, 2918-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3021, 3024, 3031, 3046-3066, + 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3132-3140, 3142-3144, 3146-3149, + 3157-3158, 3160-3162, 3164-3165, 3168-3171, 3174-3183, 3191-3212, 3214-3216, + 3218-3240, 3242-3251, 3253-3257, 3260-3268, 3270-3272, 3274-3277, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, 3346-3396, + 
3398-3400, 3402-3407, 3412-3427, 3430-3455, 3457-3459, 3461-3478, 3482-3505, + 3507-3515, 3517, 3520-3526, 3530, 3535-3540, 3542, 3544-3551, 3558-3567, + 3570-3572, 3585-3642, 3647-3675, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3773, 3776-3780, 3782, 3784-3790, 3792-3801, 3804-3807, 3840-3911, + 3913-3948, 3953-3991, 3993-4028, 4030-4044, 4046-4058, 4096-4293, 4295, 4301, + 4304-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, 4746-4749, + 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, 4824-4880, + 4882-4885, 4888-4954, 4957-4988, 4992-5017, 5024-5109, 5112-5117, 5120-5788, + 5792-5880, 5888-5909, 5919-5942, 5952-5971, 5984-5996, 5998-6000, 6002-6003, + 6016-6109, 6112-6121, 6128-6137, 6144-6157, 6159-6169, 6176-6264, 6272-6314, + 6320-6389, 6400-6430, 6432-6443, 6448-6459, 6464, 6468-6509, 6512-6516, + 6528-6571, 6576-6601, 6608-6618, 6622-6683, 6686-6750, 6752-6780, 6783-6793, + 6800-6809, 6816-6829, 6832-6877, 6880-6891, 6912-6988, 6990-7155, 7164-7223, + 7227-7241, 7245-7306, 7312-7354, 7357-7367, 7376-7418, 7424-7957, 7960-7965, + 7968-8005, 8008-8013, 8016-8023, 8025, 8027, 8029, 8031-8061, 8064-8116, + 8118-8132, 8134-8147, 8150-8155, 8157-8175, 8178-8180, 8182-8190, 8192-8202, + 8208-8231, 8239-8287, 8304-8305, 8308-8334, 8336-8348, 8352-8385, 8400-8432, + 8448-8587, 8592-9257, 9280-9290, 9312-11123, 11126-11507, 11513-11557, 11559, + 11565, 11568-11623, 11631-11632, 11647-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, 11736-11742, + 11744-11869, 11904-11929, 11931-12019, 12032-12245, 12272-12351, 12353-12438, + 12441-12543, 12549-12591, 12593-12686, 12688-12773, 12783-12830, 12832-42124, + 42128-42182, 42192-42539, 42560-42743, 42752-42972, 42993-43052, 43056-43065, + 43072-43127, 43136-43205, 43214-43225, 43232-43347, 43359-43388, 43392-43469, + 43471-43481, 43486-43518, 43520-43574, 43584-43597, 43600-43609, 43612-43714, + 43739-43766, 43777-43782, 43785-43790, 
43793-43798, 43808-43814, 43816-43822, + 43824-43883, 43888-44013, 44016-44025, 44032-55203, 55216-55238, 55243-55291, + 63744-64109, 64112-64217, 64256-64262, 64275-64279, 64285-64310, 64312-64316, + 64318, 64320-64321, 64323-64324, 64326-64975, 65008-65049, 65056-65106, + 65108-65126, 65128-65131, 65136-65140, 65142-65276, 65281-65470, 65474-65479, + 65482-65487, 65490-65495, 65498-65500, 65504-65510, 65512-65518, 65532-65533, + 65536-65547, 65549-65574, 65576-65594, 65596-65597, 65599-65613, 65616-65629, + 65664-65786, 65792-65794, 65799-65843, 65847-65934, 65936-65948, 65952, + 66000-66045, 66176-66204, 66208-66256, 66272-66299, 66304-66339, 66349-66378, + 66384-66426, 66432-66461, 66463-66499, 66504-66517, 66560-66717, 66720-66729, + 66736-66771, 66776-66811, 66816-66855, 66864-66915, 66927-66938, 66940-66954, + 66956-66962, 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, 67463-67504, + 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, 67644, + 67647-67669, 67671-67742, 67751-67759, 67808-67826, 67828-67829, 67835-67867, + 67871-67897, 67903-67929, 67968-68023, 68028-68047, 68050-68099, 68101-68102, + 68108-68115, 68117-68119, 68121-68149, 68152-68154, 68159-68168, 68176-68184, + 68192-68255, 68288-68326, 68331-68342, 68352-68405, 68409-68437, 68440-68466, + 68472-68497, 68505-68508, 68521-68527, 68608-68680, 68736-68786, 68800-68850, + 68858-68903, 68912-68921, 68928-68965, 68969-68997, 69006-69007, 69216-69246, + 69248-69289, 69291-69293, 69296-69297, 69314-69319, 69328-69336, 69370-69415, + 69424-69465, 69488-69513, 69552-69579, 69600-69622, 69632-69709, 69714-69749, + 69759-69820, 69822-69826, 69840-69864, 69872-69881, 69888-69940, 69942-69959, + 69968-70006, 70016-70111, 70113-70132, 70144-70161, 70163-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70313, 70320-70378, 70384-70393, + 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, 
70450-70451, + 70453-70457, 70459-70468, 70471-70472, 70475-70477, 70480, 70487, + 70493-70499, 70502-70508, 70512-70516, 70528-70537, 70539, 70542, + 70544-70581, 70583-70592, 70594, 70597, 70599-70602, 70604-70613, + 70615-70616, 70625-70626, 70656-70747, 70749-70753, 70784-70855, 70864-70873, + 71040-71093, 71096-71133, 71168-71236, 71248-71257, 71264-71276, 71296-71353, + 71360-71369, 71376-71395, 71424-71450, 71453-71467, 71472-71494, 71680-71739, + 71840-71922, 71935-71942, 71945, 71948-71955, 71957-71958, 71960-71989, + 71991-71992, 71995-72006, 72016-72025, 72096-72103, 72106-72151, 72154-72164, + 72192-72263, 72272-72354, 72368-72440, 72448-72457, 72544-72551, 72640-72673, + 72688-72697, 72704-72712, 72714-72758, 72760-72773, 72784-72812, 72816-72847, + 72850-72871, 72873-72886, 72960-72966, 72968-72969, 72971-73014, 73018, + 73020-73021, 73023-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73112, 73120-73129, 73136-73179, 73184-73193, 73440-73464, + 73472-73488, 73490-73530, 73534-73562, 73648, 73664-73713, 73727-74649, + 74752-74862, 74864-74868, 74880-75075, 77712-77810, 77824-78895, 78912-78933, + 78944-82938, 82944-83526, 90368-90425, 92160-92728, 92736-92766, 92768-92777, + 92782-92862, 92864-92873, 92880-92909, 92912-92917, 92928-92997, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93561, 93760-93850, 93856-93880, + 93883-93907, 93952-94026, 94031-94087, 94095-94111, 94176-94180, 94192-94198, + 94208-101589, 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, 110948-110951, + 110960-111355, 113664-113770, 113776-113788, 113792-113800, 113808-113817, + 113820-113823, 117760-118012, 118016-118451, 118458-118480, 118496-118512, + 118528-118573, 118576-118598, 118608-118723, 118784-119029, 119040-119078, + 119081-119154, 119163-119274, 119296-119365, 119488-119507, 119520-119539, + 119552-119638, 119648-119672, 119808-119892, 
119894-119964, 119966-119967, + 119970, 119973-119974, 119977-119980, 119982-119993, 119995, 119997-120003, + 120005-120069, 120071-120074, 120077-120084, 120086-120092, 120094-120121, + 120123-120126, 120128-120132, 120134, 120138-120144, 120146-120485, + 120488-120779, 120782-121483, 121499-121503, 121505-121519, 122624-122654, + 122661-122666, 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123184-123197, + 123200-123209, 123214-123215, 123536-123566, 123584-123641, 123647, + 124112-124153, 124368-124410, 124415, 124608-124638, 124640-124661, + 124670-124671, 124896-124902, 124904-124907, 124909-124910, 124912-124926, + 124928-125124, 125127-125142, 125184-125259, 125264-125273, 125278-125279, + 126065-126132, 126209-126269, 126464-126467, 126469-126495, 126497-126498, + 126500, 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, 126555, + 126557, 126559, 126561-126562, 126564, 126567-126570, 126572-126578, + 126580-126583, 126585-126588, 126590, 126592-126601, 126603-126619, + 126625-126627, 126629-126633, 126635-126651, 126704-126705, 126976-127019, + 127024-127123, 127136-127150, 127153-127167, 127169-127183, 127185-127221, + 127232-127405, 127462-127490, 127504-127547, 127552-127560, 127568-127569, + 127584-127589, 127744-128728, 128732-128748, 128752-128764, 128768-128985, + 128992-129003, 129008, 129024-129035, 129040-129095, 129104-129113, + 129120-129159, 129168-129197, 129200-129211, 129216-129217, 129232-129240, + 129280-129623, 129632-129645, 129648-129660, 129664-129674, 129678-129734, + 129736, 129741-129756, 129759-129770, 129775-129784, 129792-129938, + 129940-130042, 131072-173791, 173824-178205, 178208-183981, 183984-191456, + 191472-192093, 194560-195101, 196608-201546, 201552-210041, + 917760-917999 + |}] +;; + +let%expect_test "print" = + Format.printf "%a@." 
Cset.pp Cset.print; + [%expect {| + 32-126, 161-172, 174-887, 890-895, 900-906, 908, 910-929, 931-1327, + 1329-1366, 1369-1418, 1421-1423, 1425-1479, 1488-1514, 1519-1524, 1542-1563, + 1565-1756, 1758-1805, 1808-1866, 1869-1969, 1984-2042, 2045-2093, 2096-2110, + 2112-2139, 2142, 2144-2154, 2160-2191, 2199-2273, 2275-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2492-2500, 2503-2504, + 2507-2510, 2519, 2524-2525, 2527-2531, 2534-2558, 2561-2563, 2565-2570, + 2575-2576, 2579-2600, 2602-2608, 2610-2611, 2613-2614, 2616-2617, 2620, + 2622-2626, 2631-2632, 2635-2637, 2641, 2649-2652, 2654, 2662-2678, 2689-2691, + 2693-2701, 2703-2705, 2707-2728, 2730-2736, 2738-2739, 2741-2745, 2748-2757, + 2759-2761, 2763-2765, 2768, 2784-2787, 2790-2801, 2809-2815, 2817-2819, + 2821-2828, 2831-2832, 2835-2856, 2858-2864, 2866-2867, 2869-2873, 2876-2884, + 2887-2888, 2891-2893, 2901-2903, 2908-2909, 2911-2915, 2918-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3021, 3024, 3031, 3046-3066, + 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3132-3140, 3142-3144, 3146-3149, + 3157-3158, 3160-3162, 3164-3165, 3168-3171, 3174-3183, 3191-3212, 3214-3216, + 3218-3240, 3242-3251, 3253-3257, 3260-3268, 3270-3272, 3274-3277, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, 3346-3396, + 3398-3400, 3402-3407, 3412-3427, 3430-3455, 3457-3459, 3461-3478, 3482-3505, + 3507-3515, 3517, 3520-3526, 3530, 3535-3540, 3542, 3544-3551, 3558-3567, + 3570-3572, 3585-3642, 3647-3675, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3773, 3776-3780, 3782, 3784-3790, 3792-3801, 3804-3807, 3840-3911, + 3913-3948, 3953-3991, 3993-4028, 4030-4044, 4046-4058, 4096-4293, 4295, 4301, + 4304-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, 4746-4749, + 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, 4824-4880, + 4882-4885, 4888-4954, 4957-4988, 
4992-5017, 5024-5109, 5112-5117, 5120-5759, + 5761-5788, 5792-5880, 5888-5909, 5919-5942, 5952-5971, 5984-5996, 5998-6000, + 6002-6003, 6016-6109, 6112-6121, 6128-6137, 6144-6157, 6159-6169, 6176-6264, + 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6459, 6464, 6468-6509, + 6512-6516, 6528-6571, 6576-6601, 6608-6618, 6622-6683, 6686-6750, 6752-6780, + 6783-6793, 6800-6809, 6816-6829, 6832-6877, 6880-6891, 6912-6988, 6990-7155, + 7164-7223, 7227-7241, 7245-7306, 7312-7354, 7357-7367, 7376-7418, 7424-7957, + 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, 8029, 8031-8061, + 8064-8116, 8118-8132, 8134-8147, 8150-8155, 8157-8175, 8178-8180, 8182-8190, + 8208-8231, 8240-8286, 8304-8305, 8308-8334, 8336-8348, 8352-8385, 8400-8432, + 8448-8587, 8592-9257, 9280-9290, 9312-11123, 11126-11507, 11513-11557, 11559, + 11565, 11568-11623, 11631-11632, 11647-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, 11736-11742, + 11744-11869, 11904-11929, 11931-12019, 12032-12245, 12272-12287, 12289-12351, + 12353-12438, 12441-12543, 12549-12591, 12593-12686, 12688-12773, 12783-12830, + 12832-42124, 42128-42182, 42192-42539, 42560-42743, 42752-42972, 42993-43052, + 43056-43065, 43072-43127, 43136-43205, 43214-43225, 43232-43347, 43359-43388, + 43392-43469, 43471-43481, 43486-43518, 43520-43574, 43584-43597, 43600-43609, + 43612-43714, 43739-43766, 43777-43782, 43785-43790, 43793-43798, 43808-43814, + 43816-43822, 43824-43883, 43888-44013, 44016-44025, 44032-55203, 55216-55238, + 55243-55291, 63744-64109, 64112-64217, 64256-64262, 64275-64279, 64285-64310, + 64312-64316, 64318, 64320-64321, 64323-64324, 64326-64975, 65008-65049, + 65056-65106, 65108-65126, 65128-65131, 65136-65140, 65142-65276, 65281-65470, + 65474-65479, 65482-65487, 65490-65495, 65498-65500, 65504-65510, 65512-65518, + 65532-65533, 65536-65547, 65549-65574, 65576-65594, 65596-65597, 65599-65613, + 65616-65629, 65664-65786, 65792-65794, 65799-65843, 
65847-65934, 65936-65948, + 65952, 66000-66045, 66176-66204, 66208-66256, 66272-66299, 66304-66339, + 66349-66378, 66384-66426, 66432-66461, 66463-66499, 66504-66517, 66560-66717, + 66720-66729, 66736-66771, 66776-66811, 66816-66855, 66864-66915, 66927-66938, + 66940-66954, 66956-66962, 66964-66965, 66967-66977, 66979-66993, 66995-67001, + 67003-67004, 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67671-67742, 67751-67759, 67808-67826, 67828-67829, + 67835-67867, 67871-67897, 67903-67929, 67968-68023, 68028-68047, 68050-68099, + 68101-68102, 68108-68115, 68117-68119, 68121-68149, 68152-68154, 68159-68168, + 68176-68184, 68192-68255, 68288-68326, 68331-68342, 68352-68405, 68409-68437, + 68440-68466, 68472-68497, 68505-68508, 68521-68527, 68608-68680, 68736-68786, + 68800-68850, 68858-68903, 68912-68921, 68928-68965, 68969-68997, 69006-69007, + 69216-69246, 69248-69289, 69291-69293, 69296-69297, 69314-69319, 69328-69336, + 69370-69415, 69424-69465, 69488-69513, 69552-69579, 69600-69622, 69632-69709, + 69714-69749, 69759-69820, 69822-69826, 69840-69864, 69872-69881, 69888-69940, + 69942-69959, 69968-70006, 70016-70111, 70113-70132, 70144-70161, 70163-70209, + 70272-70278, 70280, 70282-70285, 70287-70301, 70303-70313, 70320-70378, + 70384-70393, 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, + 70450-70451, 70453-70457, 70459-70468, 70471-70472, 70475-70477, 70480, + 70487, 70493-70499, 70502-70508, 70512-70516, 70528-70537, 70539, 70542, + 70544-70581, 70583-70592, 70594, 70597, 70599-70602, 70604-70613, + 70615-70616, 70625-70626, 70656-70747, 70749-70753, 70784-70855, 70864-70873, + 71040-71093, 71096-71133, 71168-71236, 71248-71257, 71264-71276, 71296-71353, + 71360-71369, 71376-71395, 71424-71450, 71453-71467, 71472-71494, 71680-71739, + 71840-71922, 71935-71942, 71945, 71948-71955, 71957-71958, 71960-71989, + 71991-71992, 
71995-72006, 72016-72025, 72096-72103, 72106-72151, 72154-72164, + 72192-72263, 72272-72354, 72368-72440, 72448-72457, 72544-72551, 72640-72673, + 72688-72697, 72704-72712, 72714-72758, 72760-72773, 72784-72812, 72816-72847, + 72850-72871, 72873-72886, 72960-72966, 72968-72969, 72971-73014, 73018, + 73020-73021, 73023-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73112, 73120-73129, 73136-73179, 73184-73193, 73440-73464, + 73472-73488, 73490-73530, 73534-73562, 73648, 73664-73713, 73727-74649, + 74752-74862, 74864-74868, 74880-75075, 77712-77810, 77824-78895, 78912-78933, + 78944-82938, 82944-83526, 90368-90425, 92160-92728, 92736-92766, 92768-92777, + 92782-92862, 92864-92873, 92880-92909, 92912-92917, 92928-92997, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93561, 93760-93850, 93856-93880, + 93883-93907, 93952-94026, 94031-94087, 94095-94111, 94176-94180, 94192-94198, + 94208-101589, 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, 110948-110951, + 110960-111355, 113664-113770, 113776-113788, 113792-113800, 113808-113817, + 113820-113823, 117760-118012, 118016-118451, 118458-118480, 118496-118512, + 118528-118573, 118576-118598, 118608-118723, 118784-119029, 119040-119078, + 119081-119154, 119163-119274, 119296-119365, 119488-119507, 119520-119539, + 119552-119638, 119648-119672, 119808-119892, 119894-119964, 119966-119967, + 119970, 119973-119974, 119977-119980, 119982-119993, 119995, 119997-120003, + 120005-120069, 120071-120074, 120077-120084, 120086-120092, 120094-120121, + 120123-120126, 120128-120132, 120134, 120138-120144, 120146-120485, + 120488-120779, 120782-121483, 121499-121503, 121505-121519, 122624-122654, + 122661-122666, 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123184-123197, + 123200-123209, 123214-123215, 123536-123566, 123584-123641, 123647, + 
124112-124153, 124368-124410, 124415, 124608-124638, 124640-124661, + 124670-124671, 124896-124902, 124904-124907, 124909-124910, 124912-124926, + 124928-125124, 125127-125142, 125184-125259, 125264-125273, 125278-125279, + 126065-126132, 126209-126269, 126464-126467, 126469-126495, 126497-126498, + 126500, 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, 126555, + 126557, 126559, 126561-126562, 126564, 126567-126570, 126572-126578, + 126580-126583, 126585-126588, 126590, 126592-126601, 126603-126619, + 126625-126627, 126629-126633, 126635-126651, 126704-126705, 126976-127019, + 127024-127123, 127136-127150, 127153-127167, 127169-127183, 127185-127221, + 127232-127405, 127462-127490, 127504-127547, 127552-127560, 127568-127569, + 127584-127589, 127744-128728, 128732-128748, 128752-128764, 128768-128985, + 128992-129003, 129008, 129024-129035, 129040-129095, 129104-129113, + 129120-129159, 129168-129197, 129200-129211, 129216-129217, 129232-129240, + 129280-129623, 129632-129645, 129648-129660, 129664-129674, 129678-129734, + 129736, 129741-129756, 129759-129770, 129775-129784, 129792-129938, + 129940-130042, 131072-173791, 173824-178205, 178208-183981, 183984-191456, + 191472-192093, 194560-195101, 196608-201546, 201552-210041, + 917760-917999 + |}] +;; + +let%expect_test "punct" = + Format.printf "%a@." 
Cset.pp Cset.punct; + [%expect + {| + 33-35, 37-42, 44-47, 58-59, 63-64, 91-93, 95, 123, 125, 161, 167, 171, + 182-183, 187, 191, 894, 903, 1370-1375, 1417-1418, 1470, 1472, 1475, 1478, + 1523-1524, 1545-1546, 1548-1549, 1563, 1565-1567, 1642-1645, 1748, 1792-1805, + 2039-2041, 2096-2110, 2142, 2404-2405, 2416, 2557, 2678, 2800, 3191, 3204, + 3572, 3663, 3674-3675, 3844-3858, 3860, 3898-3901, 3973, 4048-4052, + 4057-4058, 4170-4175, 4347, 4960-4968, 5120, 5742, 5787-5788, 5867-5869, + 5941-5942, 6100-6102, 6104-6106, 6144-6154, 6468-6469, 6686-6687, 6816-6822, + 6824-6829, 6990-6991, 7002-7008, 7037-7039, 7164-7167, 7227-7231, 7294-7295, + 7360-7367, 7379, 8208-8231, 8240-8259, 8261-8273, 8275-8286, 8317-8318, + 8333-8334, 8968-8971, 9001-9002, 10088-10101, 10181-10182, 10214-10223, + 10627-10648, 10712-10715, 10748-10749, 11513-11516, 11518-11519, 11632, + 11776-11822, 11824-11855, 11858-11869, 12289-12291, 12296-12305, 12308-12319, + 12336, 12349, 12448, 12539, 42238-42239, 42509-42511, 42611, 42622, + 42738-42743, 43124-43127, 43214-43215, 43256-43258, 43260, 43310-43311, + 43359, 43457-43469, 43486-43487, 43612-43615, 43742-43743, 43760-43761, + 44011, 64830-64831, 65040-65049, 65072-65106, 65108-65121, 65123, 65128, + 65130-65131, 65281-65283, 65285-65290, 65292-65295, 65306-65307, 65311-65312, + 65339-65341, 65343, 65371, 65373, 65375-65381, 65792-65794, 66463, 66512, + 66927, 67671, 67871, 67903, 68176-68184, 68223, 68336-68342, 68409-68415, + 68505-68508, 68974, 69293, 69328, 69461-69465, 69510-69513, 69703-69709, + 69819-69820, 69822-69825, 69952-69955, 70004-70005, 70085-70088, 70093, + 70107, 70109-70111, 70200-70205, 70313, 70612-70613, 70615-70616, + 70731-70735, 70746-70747, 70749, 70854, 71105-71127, 71233-71235, + 71264-71276, 71353, 71484-71486, 71739, 72004-72006, 72162, 72255-72262, + 72346-72348, 72350-72354, 72448-72457, 72673, 72769-72773, 72816-72817, + 73463-73464, 73539-73551, 73727, 74864-74868, 77809-77810, 92782-92783, + 92917, 
92983-92987, 92996, 93549-93551, 93847-93850, 94178, 113823, + 121479-121483, 124415, + 125278-125279 + |}] +;; + +let%expect_test "cany" = + Format.printf "%a@." Cset.pp Cset.cany; + [%expect {| + 0-55295, + 57344-1114111 + |}] +;; + +let%expect_test "case_insens" = + let cset = Cset.diff (Cset.case_insens Cset.lower) (Cset.case_insens Cset.upper) in + Format.printf "%a@." Cset.pp cset; + [%expect {| + 91-96, 149-170, 181, 186, 191-215, 223, 257, 259, 261, 263, 265, 267, 269, + 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, + 301, 303, 305, 307, 309, 311, 331, 333, 335, 337, 339, 341, 343, 355, 357, + 359-361, 363, 365, 367, 369, 371, 373, 375, 382-387, 389, 394-397, 400-402, + 404-405, 413-414, 419, 421, 424, 429, 434-436, 438, 440-442, 449-453, 456, + 459, 464, 468, 470, 474, 479, 481, 483, 485, 489, 491, 495-498, 501, 503-505, + 507, 509, 511, 513, 515, 517, 519, 521, 523, 525, 527, 531, 533, 537, 539, + 541, 543, 545, 547, 549, 551, 553, 555, 557, 559, 561, 563, 575, 583, 585, + 587, 589, 591, 662-696, 704-705, 736-740, 837, 849-881, 883, 887, 890-893, + 909, 930, 976-977, 981-983, 985, 987, 989, 991, 993, 995, 997, 999, 1001, + 1003, 1005, 1013, 1019, 1040-1119, 1121, 1123, 1125, 1127, 1129, 1131, 1133, + 1135, 1137, 1139, 1141, 1143, 1145, 1147, 1149, 1151, 1153, 1163, 1165, 1167, + 1169, 1171, 1173, 1175, 1177, 1179, 1181, 1183, 1185, 1187, 1189, 1191, 1193, + 1195, 1197, 1199, 1201, 1203, 1205, 1207, 1209, 1211, 1213, 1215, 1233, 1235, + 1237, 1239, 1241, 1243, 1245, 1247, 1251, 1253, 1255, 1257, 1259, 1261, 1263, + 1265, 1267, 1269, 1271, 1273, 1275, 1277, 1279, 1281, 1283, 1285, 1287, 1289, + 1291, 1293, 1295, 1297, 1299, 1301, 1303, 1305, 1307, 1309, 1311, 1313, 1315, + 1317, 1319, 1321, 1323, 1325, 1327, 4272-4346, 4348-4351, 5080-5117, + 7264-7304, 7306, 7649-7681, 7683, 7685, 7687, 7689, 7691, 7693, 7695, 7697, + 7699, 7701, 7703, 7705, 7707, 7709, 7711, 7713, 7715, 7717, 7719, 7721, 7723, + 7725, 7727, 7729, 7731, 7733, 
7735, 7737, 7739, 7741, 7743, 7745, 7747, 7749, + 7751, 7753, 7755, 7757, 7759, 7761, 7763, 7765, 7767, 7769, 7771, 7773, 7775, + 7777, 7779, 7781, 7783, 7785, 7787, 7789, 7791, 7793, 7795, 7797, 7799, 7801, + 7803, 7805, 7807, 7809, 7811, 7813, 7815, 7817, 7819, 7821, 7823, 7825, 7827, + 7829, 7839, 7841, 7843, 7845, 7847, 7849, 7851, 7853, 7855, 7857, 7859, 7861, + 7863, 7865, 7867, 7869, 7871, 7873, 7875, 7877, 7879, 7881, 7883, 7885, 7887, + 7889, 7891, 7893, 7895, 7897, 7899, 7901, 7903, 7905, 7907, 7909, 7911, 7913, + 7915, 7917, 7919, 7921, 7923, 7925, 7927, 7929, 7931, 7933, 7935, 7952-7975, + 7984-7991, 8000-8007, 8032-8039, 8048-8056, 8064-8071, 8080-8087, 8094-8103, + 8112-8116, 8118-8119, 8126, 8128-8135, 8144-8151, 8160-8167, 8178-8180, + 8182-8183, 8319, 8336-8348, 8426-8454, 8456-8458, 8462-8463, 8467-8468, + 8470-8472, 8478-8481, 8485, 8489, 8494-8495, 8500, 8508-8509, 8526, + 8548-8575, 9424-9429, 11280-11359, 11361, 11365-11366, 11368, 11370, 11372, + 11377, 11379-11380, 11382-11389, 11393, 11397, 11401, 11403, 11405, 11407, + 11409, 11411, 11415, 11417, 11419, 11421, 11425, 11427, 11429, 11431, 11433, + 11435, 11437, 11439, 11441, 11443, 11445, 11447, 11449, 11451, 11453, 11455, + 11457, 11459, 11461, 11463, 11465, 11467, 11469, 11471, 11473, 11475, 11477, + 11479, 11481, 11483, 11485, 11487, 11489, 11491, 11507, 11559, 11565, + 42529-42561, 42563, 42565, 42567, 42569, 42571, 42573, 42575, 42577, 42579, + 42581, 42583, 42585, 42587, 42589, 42591, 42593, 42595, 42597, 42599, 42601, + 42603, 42605, 42625, 42627, 42629, 42631, 42633, 42635, 42637, 42639, 42641, + 42643, 42645, 42647, 42649, 42651-42653, 42787, 42789, 42791, 42793, 42795, + 42797, 42799-42801, 42803, 42805, 42807, 42809, 42811, 42813, 42815, 42817, + 42819, 42821, 42823, 42825, 42827, 42829, 42831, 42833, 42835, 42837, 42839, + 42841, 42843, 42845, 42847, 42849, 42851, 42853, 42855, 42857, 42859, 42861, + 42863, 42879, 42881, 42883, 42885, 42887, 42894-42897, 42899-42901, 42903, + 
42907, 42909, 42911, 42913, 42915, 42917, 42919, 42921, 42927, 42933, 42935, + 42937, 42939, 42941, 42943, 42945, 42947, 42954, 42957, 42959, 42963, 42965, + 42967, 42969, 42971, 42998, 43000-43002, 43792-43967, 64224-64262, + 64275-64279, 65313-65344, 66568-66639, 66744-66811, 66935-66939, 66955, + 66979-66993, 66995-67001, 67003-67004, 67456, 67459-67461, 67463-67504, + 67506-67514, 68768-68850, 68944-68975, 71840-71903, 93760-93823, 93851-93887, + 119802-119839, 119886-119891, 119894-119943, 119965-119972, 119975-119976, + 119981, 119990-119993, 119995, 120005-120047, 120075-120076, 120085, + 120093-120099, 120127, 120133, 120135-120137, 120145-120151, 120198-120203, + 120250-120255, 120302-120307, 120354-120359, 120406-120411, 120458-120463, + 120513-120519, 120540-120577, 120598-120635, 120656-120693, 120714-120751, + 120772-120777, 120779, 122661-122666, 122928-122989, + 125186-125251 + |}] +;; + +let%expect_test "one_char" = + let test set = + let pp fmt cp = + Fmt.(opt Cset.CodePage.pp) fmt cp + in + Format.printf "%a@." pp (Cset.one_c set) + in + test Cset.empty; + [%expect {| |}]; + test (Cset.CodePage.of_char 'c' |> Cset.csingle ); + [%expect {| 99 |}]; + test Cset.cany; + [%expect {| |}] +;; + +let%expect_test "is_empty" = + let test set = Format.printf "%a@." Fmt.bool (Cset.is_empty set) in + test Cset.empty; + [%expect {| true |}]; + test (Cset.CodePage.of_char 'c' |> Cset.csingle); + [%expect {| false |}]; + test Cset.cany; + [%expect {| false |}] +;; + +let%expect_test "Cset mem" = + let test set c = Format.printf "%a@." Fmt.bool (Cset.mem c set) in + test Cset.cany Cset.CodePage.null; + [%expect {| false |}]; + test Cset.cany (Cset.CodePage.of_char 'a' |> Cset.CodePage.from_letter); + [%expect {| true |}]; + let c = Cset.CodePage.of_char 'c' |> Cset.csingle in + test c (Cset.CodePage.of_char 'c' |> Cset.CodePage.from_letter); + [%expect {| true |}]; + test c (Cset.CodePage.of_char '.' 
|> Cset.CodePage.from_letter); + [%expect {| false |}] +;; diff --git a/lib_test/unicode/expect/test_emacs.ml b/lib_test/unicode/expect/test_emacs.ml new file mode 100644 index 00000000..f3d22e59 --- /dev/null +++ b/lib_test/unicode/expect/test_emacs.ml @@ -0,0 +1,430 @@ +open Import + +(* + * Tests based on description of emacs regular expressions given at + * http://www.gnu.org/manual/elisp-manual-20-2.5/html_chapter/elisp_34.html + *) + +let re re = Format.printf "%a@." Re.pp (Re.Emacs.re re) + +let%expect_test "not supported" = + let re s = + try ignore (Re.Emacs.re s) with + | Re_private_unicode.Emacs.Parse_error -> print_endline "Parse error" + | Re_private_unicode.Emacs.Not_supported -> print_endline "Not supported" + in + re "*ab"; + [%expect {| Parse error |}]; + re "+ab"; + [%expect {| Parse error |}]; + re "?ab"; + [%expect {| Parse error |}]; + re "\\0"; + [%expect {| Not supported |}] +;; + +let%expect_test "escaping special characters" = + re "\\."; + [%expect {| (Set 46) |}]; + re "\\*"; + [%expect {| (Set 42) |}]; + re "\\+"; + [%expect {| (Set 43) |}]; + re "\\?"; + [%expect {| (Set 63) |}]; + re "\\["; + [%expect {| (Set 91) |}]; + re "\\]"; + [%expect {| (Set 93) |}]; + re "\\^"; + [%expect {| (Set 94) |}]; + re "\\$"; + [%expect {| (Set 36) |}]; + re "\\\\"; + [%expect {| (Set 92) |}] +;; + +let%expect_test "special characeters" = + re "."; + [%expect {| (Set 0-9, 14-132, 134-8231, 8234-55295, 57344-1114111) |}]; + re "a*"; + [%expect {| (Repeat (Set 97) 0) |}]; + re "a+"; + [%expect {| (Repeat (Set 97) 1) |}]; + re "a?"; + [%expect {| (Repeat (Set 97) 0 1) |}]; + re "[ab]"; + [%expect {| (Alternative (Set 98)(Set 97)) |}]; + re "[a-z]"; + [%expect {| (Set 97-122) |}]; + re "[a-z$%.]"; + [%expect {| (Alternative (Set 46)(Set 37)(Set 36)(Set 97-122)) |}]; + re "[]a]"; + [%expect {| (Alternative (Set 97)(Set 93)) |}]; + re "[]-]"; + [%expect {| (Alternative (Set 93)(Set 45)) |}]; + re "[a^]"; + [%expect {| (Alternative (Set 94)(Set 97)) |}]; + re 
"[^a-z]"; + [%expect {| (Complement (Set 97-122)) |}]; + re "[^a-z$]"; + [%expect {| (Complement (Set 36)(Set 97-122)) |}]; + re "^"; + [%expect {| Beg_of_line |}]; + re "$"; + [%expect {| End_of_line |}] +;; + +let%expect_test "alternatives" = + re "a\\|b"; + [%expect {| (Alternative (Set 97)(Set 98)) |}]; + re "aa\\|bb"; + [%expect {| (Alternative (Sequence (Set 97)(Set 97))(Sequence (Set 98)(Set 98))) |}] +;; + +let%expect_test "contexts" = + re "\\`"; + [%expect {| Beg_of_str |}]; + re "\\'"; + [%expect {| End_of_str |}]; + re "\\="; + [%expect {| Start |}]; + re "\\b"; + [%expect {| (Alternative Beg_of_wordEnd_of_word) |}]; + re "\\B"; + [%expect {| Not_bound |}]; + re "\\<"; + [%expect {| Beg_of_word |}]; + re "\\>"; + [%expect {| End_of_word |}] +;; + +let%expect_test "word-constituent" = + re "\\w"; + [%expect + {| + (Alternative + (Set 48-57, 65-90, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, + 890-893, 895, 902, 904-906, 908, 910-929, 931-1013, 1015-1153, + 1162-1327, 1329-1366, 1369, 1376-1416, 1456-1469, 1471, 1473-1474, + 1476-1477, 1479, 1488-1514, 1519-1522, 1552-1562, 1568-1623, 1625-1641, + 1646-1747, 1749-1756, 1761-1768, 1773-1788, 1791, 1808-1855, 1869-1969, + 1984-2026, 2036-2037, 2042, 2048-2071, 2074-2092, 2112-2136, 2144-2154, + 2160-2183, 2185-2191, 2199, 2208-2249, 2260-2271, 2275-2281, 2288-2363, + 2365-2380, 2382-2384, 2389-2403, 2406-2415, 2417-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, 2503-2504, + 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, 2548-2553, + 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, + 2613-2614, 2616-2617, 2622-2626, 2631-2632, 2635-2636, 2641, 2649-2652, + 2654, 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, 2730-2736, + 2738-2739, 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, 2784-2787, + 2790-2799, 2809-2812, 2817-2819, 2821-2828, 2831-2832, 
2835-2856, + 2858-2864, 2866-2867, 2869-2873, 2877-2884, 2887-2888, 2891-2892, + 2902-2903, 2908-2909, 2911-2915, 2918-2927, 2929-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3020, 3024, 3031, + 3046-3058, 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3133-3140, + 3142-3144, 3146-3148, 3157-3158, 3160-3162, 3164-3165, 3168-3171, + 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, 3218-3240, + 3242-3251, 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, + 3346-3386, 3389-3396, 3398-3400, 3402-3404, 3406, 3412-3427, 3430-3448, + 3450-3455, 3457-3459, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, + 3535-3540, 3542, 3544-3551, 3558-3567, 3570-3571, 3585-3642, 3648-3654, + 3661, 3664-3673, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3769, 3771-3773, 3776-3780, 3782, 3789, 3792-3801, 3804-3807, + 3840, 3872-3891, 3904-3911, 3913-3948, 3953-3971, 3976-3991, 3993-4028, + 4096-4150, 4152, 4155-4169, 4176-4253, 4256-4293, 4295, 4301, + 4304-4346, 4348-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, + 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, + 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, 5024-5109, + 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, + 5888-5907, 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, + 6016-6067, 6070-6088, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, + 6176-6264, 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6456, + 6470-6509, 6512-6516, 6528-6571, 6576-6601, 6608-6618, 6656-6683, + 6688-6750, 6753-6772, 6784-6793, 6800-6809, 6823, 6847-6848, 6860-6862, + 6912-6963, 6965-6979, 6981-6988, 6992-7001, 7040-7081, 7084-7141, + 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, 7312-7354, + 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, 7635-7668, + 7680-7957, 7960-7965, 7968-8005, 
8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, + 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8304-8305, + 8308-8313, 8319-8329, 8336-8348, 8450, 8455, 8458-8467, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, 8508-8511, + 8517-8521, 8526, 8528-8585, 9312-9371, 9398-9471, 10102-10131, + 11264-11492, 11499-11502, 11506-11507, 11517, 11520-11557, 11559, + 11565, 11568-11623, 11631, 11648-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, + 11736-11742, 11744-11775, 11823, 12293-12295, 12321-12329, 12337-12341, + 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12690-12693, 12704-12735, 12784-12799, + 12832-12841, 12872-12879, 12881-12895, 12928-12937, 12977-12991, + 13312-19903, 19968-42124, 42192-42237, 42240-42508, 42512-42539, + 42560-42606, 42612-42619, 42623-42735, 42775-42783, 42786-42888, + 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, + 43136-43203, 43205, 43216-43225, 43250-43255, 43259, 43261-43306, + 43312-43346, 43360-43388, 43392-43442, 43444-43455, 43471-43481, + 43488-43518, 43520-43574, 43584-43597, 43600-43609, 43616-43638, + 43642-43710, 43712, 43714, 43739-43741, 43744-43759, 43762-43765, + 43777-43782, 43785-43790, 43793-43798, 43808-43814, 43816-43822, + 43824-43866, 43868-43881, 43888-44010, 44016-44025, 44032-55203, + 55216-55238, 55243-55291, 63744-64109, 64112-64217, 64256-64262, + 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, 64320-64321, + 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65296-65305, 65313-65338, + 65345-65370, 65382-65470, 65474-65479, 65482-65487, 65490-65495, + 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 65799-65843, 65856-65912, + 65930-65931, 66176-66204, 66208-66256, 66273-66299, 66304-66339, + 
66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, + 66513-66517, 66560-66717, 66720-66729, 66736-66771, 66776-66811, + 66816-66855, 66864-66915, 66928-66938, 66940-66954, 66956-66962, + 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67672-67702, 67705-67742, 67751-67759, 67808-67826, + 67828-67829, 67835-67867, 67872-67897, 67904-67929, 67968-68023, + 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, + 68121-68149, 68160-68168, 68192-68222, 68224-68255, 68288-68295, + 68297-68324, 68331-68335, 68352-68405, 68416-68437, 68440-68466, + 68472-68497, 68521-68527, 68608-68680, 68736-68786, 68800-68850, + 68858-68903, 68912-68921, 68928-68965, 68969, 68975-68997, 69216-69246, + 69248-69289, 69291-69292, 69296-69297, 69314-69319, 69370-69372, + 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, 69826, + 69840-69864, 69872-69881, 69888-69938, 69942-69951, 69956-69959, + 69968-70002, 70006, 70016-70079, 70081-70084, 70094-70106, 70108, + 70113-70132, 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, + 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, + 70450-70451, 70453-70457, 70461-70468, 70471-70472, 70475-70476, 70480, + 70487, 70493-70499, 70528-70537, 70539, 70542, 70544-70581, + 70583-70592, 70594, 70597, 70599-70602, 70604-70605, 70609, 70611, + 70656-70721, 70723-70725, 70727-70730, 70736-70745, 70751-70753, + 70784-70849, 70852-70853, 70855, 70864-70873, 71040-71093, 71096-71102, + 71128-71133, 71168-71230, 71232, 71236, 71248-71257, 71296-71349, + 71352, 71360-71369, 71376-71395, 71424-71450, 71453-71466, 71472-71483, + 71488-71494, 71680-71736, 71840-71922, 71935-71942, 71945, 
71948-71955, + 71957-71958, 71960-71989, 71991-71992, 71995-71996, 71999-72002, + 72016-72025, 72096-72103, 72106-72151, 72154-72159, 72161, 72163-72164, + 72192-72242, 72245-72254, 72272-72343, 72349, 72368-72440, 72544-72551, + 72640-72672, 72688-72697, 72704-72712, 72714-72758, 72760-72766, 72768, + 72784-72812, 72818-72847, 72850-72871, 72873-72886, 72960-72966, + 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, 73027, + 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, + 77824-78895, 78913-78918, 78944-82938, 82944-83526, 90368-90414, + 90416-90425, 92160-92728, 92736-92766, 92768-92777, 92784-92862, + 92864-92873, 92880-92909, 92928-92975, 92992-92995, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93548, 93552-93561, + 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, + 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, + 110948-110951, 110960-111355, 113664-113770, 113776-113788, + 113792-113800, 113808-113817, 113822, 118000-118009, 119488-119507, + 119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, + 119995, 119997-120003, 120005-120069, 120071-120074, 120077-120084, + 120086-120092, 120094-120121, 120123-120126, 120128-120132, 120134, + 120138-120144, 120146-120485, 120488-120512, 120514-120538, + 120540-120570, 120572-120596, 120598-120628, 120630-120654, + 120656-120686, 120688-120712, 120714-120744, 120746-120770, + 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 
123136-123180, 123191-123197, + 123200-123209, 123214, 123536-123565, 123584-123627, 123632-123641, + 124112-124139, 124144-124153, 124368-124397, 124400-124410, + 124608-124638, 124640-124661, 124670-124671, 124896-124902, + 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, + 126065-126123, 126125-126127, 126129-126132, 126209-126253, + 126255-126269, 126464-126467, 126469-126495, 126497-126498, 126500, + 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, + 126555, 126557, 126559, 126561-126562, 126564, 126567-126570, + 126572-126578, 126580-126583, 126585-126588, 126590, 126592-126601, + 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 127312-127337, 127344-127369, + 130032-130041, 131072-173791, 173824-178205, 178208-183981, + 183984-191456, 191472-192093, 194560-195101, 196608-201546, + 201552-210041) + (Set 95)) + |}]; + re "\\W"; + [%expect + {| + (Complement + (Set 48-57, 65-90, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, + 890-893, 895, 902, 904-906, 908, 910-929, 931-1013, 1015-1153, + 1162-1327, 1329-1366, 1369, 1376-1416, 1456-1469, 1471, 1473-1474, + 1476-1477, 1479, 1488-1514, 1519-1522, 1552-1562, 1568-1623, 1625-1641, + 1646-1747, 1749-1756, 1761-1768, 1773-1788, 1791, 1808-1855, 1869-1969, + 1984-2026, 2036-2037, 2042, 2048-2071, 2074-2092, 2112-2136, 2144-2154, + 2160-2183, 2185-2191, 2199, 2208-2249, 2260-2271, 2275-2281, 2288-2363, + 2365-2380, 2382-2384, 2389-2403, 2406-2415, 2417-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, 2503-2504, + 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, 2548-2553, + 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, + 2613-2614, 2616-2617, 2622-2626, 2631-2632, 
2635-2636, 2641, 2649-2652, + 2654, 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, 2730-2736, + 2738-2739, 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, 2784-2787, + 2790-2799, 2809-2812, 2817-2819, 2821-2828, 2831-2832, 2835-2856, + 2858-2864, 2866-2867, 2869-2873, 2877-2884, 2887-2888, 2891-2892, + 2902-2903, 2908-2909, 2911-2915, 2918-2927, 2929-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3020, 3024, 3031, + 3046-3058, 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3133-3140, + 3142-3144, 3146-3148, 3157-3158, 3160-3162, 3164-3165, 3168-3171, + 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, 3218-3240, + 3242-3251, 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, + 3346-3386, 3389-3396, 3398-3400, 3402-3404, 3406, 3412-3427, 3430-3448, + 3450-3455, 3457-3459, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, + 3535-3540, 3542, 3544-3551, 3558-3567, 3570-3571, 3585-3642, 3648-3654, + 3661, 3664-3673, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3769, 3771-3773, 3776-3780, 3782, 3789, 3792-3801, 3804-3807, + 3840, 3872-3891, 3904-3911, 3913-3948, 3953-3971, 3976-3991, 3993-4028, + 4096-4150, 4152, 4155-4169, 4176-4253, 4256-4293, 4295, 4301, + 4304-4346, 4348-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, + 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, + 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, 5024-5109, + 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, + 5888-5907, 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, + 6016-6067, 6070-6088, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, + 6176-6264, 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6456, + 6470-6509, 6512-6516, 6528-6571, 6576-6601, 6608-6618, 6656-6683, + 6688-6750, 6753-6772, 6784-6793, 6800-6809, 6823, 6847-6848, 6860-6862, + 6912-6963, 
6965-6979, 6981-6988, 6992-7001, 7040-7081, 7084-7141, + 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, 7312-7354, + 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, 7635-7668, + 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, + 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8304-8305, + 8308-8313, 8319-8329, 8336-8348, 8450, 8455, 8458-8467, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, 8508-8511, + 8517-8521, 8526, 8528-8585, 9312-9371, 9398-9471, 10102-10131, + 11264-11492, 11499-11502, 11506-11507, 11517, 11520-11557, 11559, + 11565, 11568-11623, 11631, 11648-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, + 11736-11742, 11744-11775, 11823, 12293-12295, 12321-12329, 12337-12341, + 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12690-12693, 12704-12735, 12784-12799, + 12832-12841, 12872-12879, 12881-12895, 12928-12937, 12977-12991, + 13312-19903, 19968-42124, 42192-42237, 42240-42508, 42512-42539, + 42560-42606, 42612-42619, 42623-42735, 42775-42783, 42786-42888, + 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, + 43136-43203, 43205, 43216-43225, 43250-43255, 43259, 43261-43306, + 43312-43346, 43360-43388, 43392-43442, 43444-43455, 43471-43481, + 43488-43518, 43520-43574, 43584-43597, 43600-43609, 43616-43638, + 43642-43710, 43712, 43714, 43739-43741, 43744-43759, 43762-43765, + 43777-43782, 43785-43790, 43793-43798, 43808-43814, 43816-43822, + 43824-43866, 43868-43881, 43888-44010, 44016-44025, 44032-55203, + 55216-55238, 55243-55291, 63744-64109, 64112-64217, 64256-64262, + 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, 64320-64321, + 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65296-65305, 65313-65338, + 65345-65370, 65382-65470, 65474-65479, 
65482-65487, 65490-65495, + 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 65799-65843, 65856-65912, + 65930-65931, 66176-66204, 66208-66256, 66273-66299, 66304-66339, + 66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, + 66513-66517, 66560-66717, 66720-66729, 66736-66771, 66776-66811, + 66816-66855, 66864-66915, 66928-66938, 66940-66954, 66956-66962, + 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67672-67702, 67705-67742, 67751-67759, 67808-67826, + 67828-67829, 67835-67867, 67872-67897, 67904-67929, 67968-68023, + 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, + 68121-68149, 68160-68168, 68192-68222, 68224-68255, 68288-68295, + 68297-68324, 68331-68335, 68352-68405, 68416-68437, 68440-68466, + 68472-68497, 68521-68527, 68608-68680, 68736-68786, 68800-68850, + 68858-68903, 68912-68921, 68928-68965, 68969, 68975-68997, 69216-69246, + 69248-69289, 69291-69292, 69296-69297, 69314-69319, 69370-69372, + 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, 69826, + 69840-69864, 69872-69881, 69888-69938, 69942-69951, 69956-69959, + 69968-70002, 70006, 70016-70079, 70081-70084, 70094-70106, 70108, + 70113-70132, 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, + 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, + 70450-70451, 70453-70457, 70461-70468, 70471-70472, 70475-70476, 70480, + 70487, 70493-70499, 70528-70537, 70539, 70542, 70544-70581, + 70583-70592, 70594, 70597, 70599-70602, 70604-70605, 70609, 70611, + 70656-70721, 70723-70725, 70727-70730, 70736-70745, 70751-70753, + 70784-70849, 70852-70853, 70855, 70864-70873, 
71040-71093, 71096-71102, + 71128-71133, 71168-71230, 71232, 71236, 71248-71257, 71296-71349, + 71352, 71360-71369, 71376-71395, 71424-71450, 71453-71466, 71472-71483, + 71488-71494, 71680-71736, 71840-71922, 71935-71942, 71945, 71948-71955, + 71957-71958, 71960-71989, 71991-71992, 71995-71996, 71999-72002, + 72016-72025, 72096-72103, 72106-72151, 72154-72159, 72161, 72163-72164, + 72192-72242, 72245-72254, 72272-72343, 72349, 72368-72440, 72544-72551, + 72640-72672, 72688-72697, 72704-72712, 72714-72758, 72760-72766, 72768, + 72784-72812, 72818-72847, 72850-72871, 72873-72886, 72960-72966, + 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, 73027, + 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, + 77824-78895, 78913-78918, 78944-82938, 82944-83526, 90368-90414, + 90416-90425, 92160-92728, 92736-92766, 92768-92777, 92784-92862, + 92864-92873, 92880-92909, 92928-92975, 92992-92995, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93548, 93552-93561, + 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, + 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, + 110948-110951, 110960-111355, 113664-113770, 113776-113788, + 113792-113800, 113808-113817, 113822, 118000-118009, 119488-119507, + 119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, + 119995, 119997-120003, 120005-120069, 120071-120074, 120077-120084, + 120086-120092, 120094-120121, 120123-120126, 120128-120132, 120134, + 120138-120144, 120146-120485, 120488-120512, 120514-120538, + 120540-120570, 120572-120596, 120598-120628, 120630-120654, + 
120656-120686, 120688-120712, 120714-120744, 120746-120770, + 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123191-123197, + 123200-123209, 123214, 123536-123565, 123584-123627, 123632-123641, + 124112-124139, 124144-124153, 124368-124397, 124400-124410, + 124608-124638, 124640-124661, 124670-124671, 124896-124902, + 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, + 126065-126123, 126125-126127, 126129-126132, 126209-126253, + 126255-126269, 126464-126467, 126469-126495, 126497-126498, 126500, + 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, + 126555, 126557, 126559, 126561-126562, 126564, 126567-126570, + 126572-126578, 126580-126583, 126585-126588, 126590, 126592-126601, + 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 127312-127337, 127344-127369, + 130032-130041, 131072-173791, 173824-178205, 178208-183981, + 183984-191456, 191472-192093, 194560-195101, 196608-201546, + 201552-210041) + (Set 95)) + |}] +;; + +let%expect_test "grouping" = + re "\\(a\\)"; + [%expect {| (Group (Set 97)) |}]; + re "\\(a\\|b\\)c"; + [%expect {| (Sequence (Group (Alternative (Set 97)(Set 98)))(Set 99)) |}] +;; + +let%expect_test "concatenation" = + re "ab"; + [%expect {| (Sequence (Set 97)(Set 98)) |}] +;; + +let%expect_test "ordinary characters" = + re "a"; + [%expect {| (Set 97) |}] +;; diff --git a/lib_test/unicode/expect/test_glob.ml b/lib_test/unicode/expect/test_glob.ml new file mode 100644 index 00000000..336cd324 --- /dev/null +++ b/lib_test/unicode/expect/test_glob.ml @@ -0,0 +1,258 @@ +open Import + +let glob ?match_backslashes ?expand_braces ?anchored ?pathname ?period re s = + let t = Re.Glob.glob ?match_backslashes ?expand_braces 
?anchored ?pathname ?period re in + (* Format.printf "[Test_glob.glob] re: '%a'\n" Re.pp t; *) + let re = Re.compile t in + (* Format.printf "[Test_glob.glob] re: '%a'\n" Re.pp_re re; *) + Format.printf "%b@." (Re.execp re s) +;; + +let%expect_test "glob" = + glob "foo*" "foobar"; + [%expect {| true |}]; + glob "fo?bar" "fobar"; + [%expect {| false |}]; + glob "fo?bar" "foobar"; + [%expect {| true |}]; + glob "fo?bar" "foo0bar"; + [%expect {| false |}]; + glob "?oobar" "foobar"; + [%expect {| true |}]; + glob "*bar" "foobar"; + [%expect {| true |}]; + glob "\\*bar" "foobar"; + [%expect {| false |}]; + glob "\\*bar" "*bar"; + [%expect {| true |}]; + glob "[ab]foo" "afoo"; + [%expect {| true |}]; + glob "[ab]foo" "bfoo"; + [%expect {| true |}]; + glob "[ab]foo" "cfoo"; + [%expect {| false |}]; + glob "c[ab]foo" "cabfoo"; + [%expect {| false |}]; + glob ".foo" ".foo"; + [%expect {| true |}]; + glob ".foo" "afoo"; + [%expect {| false |}]; + glob "*[.]foo" "a.foo"; + [%expect {| true |}]; + glob "*[.]foo" "ba.foo"; + [%expect {| true |}]; + glob "*.foo" ".foo"; + [%expect {| false |}]; + glob "*[.]foo" ".foo"; + [%expect {| false |}]; + glob ~anchored:true "*/foo" "/foo"; + [%expect {| true |}]; + glob ~anchored:true "foo/*" "foo/"; + [%expect {| true |}]; + glob "/[^f]" "/foo"; + [%expect {| false |}]; + glob "/[^f]" "/bar"; + [%expect {| true |}]; + glob ~anchored:true "/[^f]" "/bar"; + [%expect {| false |}]; + glob ~anchored:true "*" ".bar"; + [%expect {| false |}]; + glob "foo[.]bar" "foo.bar"; + [%expect {| true |}]; + glob "[.]foo" ".foo"; + [%expect {| false |}]; + glob "foo[/]bar" "foo/bar"; + [%expect {| false |}]; + glob ~anchored:true "*bar" "foobar"; + [%expect {| true |}]; + glob "foo" "foobar"; + [%expect {| true |}]; + glob "bar" "foobar"; + [%expect {| true |}]; + glob ~anchored:true "foo" "foobar"; + [%expect {| false |}]; + glob ~anchored:true "bar" "foobar"; + [%expect {| false |}]; + glob "{foo,bar}bar" "foobar"; + [%expect {| false |}]; + glob 
"{foo,bar}bar" "{foo,bar}bar"; + [%expect {| true |}]; + glob "foo?bar" "foo/bar"; + [%expect {| false |}]; + let pathname = true in + let period = true in + glob ~pathname ~period "?oobar" ".oobar"; + [%expect {| false |}]; + glob ~pathname ~period "?oobar" "/oobar"; + [%expect {| false |}]; + glob ~pathname ~period "f?obar" "f/obar"; + [%expect {| false |}]; + glob ~pathname ~period "f?obar" "f.obar"; + [%expect {| true |}]; + glob ~pathname ~period "f*.bar" "f.bar"; + [%expect {| true |}]; + glob ~pathname ~period "f?.bar" "fo.bar"; + [%expect {| true |}]; + glob ~pathname ~period "/.bar" "/.bar"; + [%expect {| true |}]; + glob ~pathname ~period "*.bar" ".bar"; + [%expect {| false |}]; + glob ~pathname ~period "?" "."; + [%expect {| false |}]; + glob ~pathname ~period "/*bar" "/.bar"; + [%expect {| false |}]; + glob "?oobar" ".oobar"; + [%expect {| false |}]; + glob "?oobar" "/oobar"; + [%expect {| false |}]; + let pathname = true in + let period = false in + glob ~pathname ~period "?oobar" "/oobar"; + [%expect {| false |}]; + glob ~pathname ~period "?oobar" ".oobar"; + [%expect {| true |}]; + glob ~pathname ~period "f?obar" "f/obar"; + [%expect {| false |}]; + glob ~pathname ~period "f?obar" "f.obar"; + [%expect {| true |}]; + let pathname = false in + let period = false in + glob ~pathname ~period "?oobar" ".oobar"; + [%expect {| true |}]; + glob ~pathname ~period "?oobar" "/oobar"; + [%expect {| true |}]; + glob ~expand_braces:true "{foo,far}bar" "foobar"; + [%expect {| true |}]; + glob ~expand_braces:true "{foo,far}bar" "farbar"; + [%expect {| true |}]; + glob ~expand_braces:true "{foo,far}bar" "{foo,far}bar"; + [%expect {| false |}] +;; + +let%expect_test "double asterisk" = + let glob = glob ~anchored:true in + glob "**" "foobar"; + [%expect {| true |}]; + glob "**" "foo/bar"; + [%expect {| true |}]; + glob "**/bar" "foo/bar"; + [%expect {| true |}]; + glob "**/bar" "foo/far/bar"; + [%expect {| true |}]; + glob "foo/**" "foo"; + [%expect {| false |}]; + 
glob "foo/**" "foo/bar"; + [%expect {| true |}]; + glob "foo/**" "foo/far/bar"; + [%expect {| true |}]; + glob "foo/**/bar" "foo/far/bar"; + [%expect {| true |}]; + glob "foo/**/bar" "foo/far/oof/bar"; + [%expect {| true |}]; + glob "foo/**bar" "foo/far/oofbar"; + [%expect {| true |}]; + glob "foo/**bar" "foo/bar"; + [%expect {| true |}]; + glob "foo/**bar" "foo/foobar"; + [%expect {| true |}]; + glob "/**" "//foo"; + [%expect {| true |}]; + glob "/**" "/"; + [%expect {| true |}]; + glob "/**" "/x"; + [%expect {| true |}]; + glob "**" "foo//bar"; + [%expect {| true |}]; + glob "foo/bar/**/*.ml" "foo/bar/baz/foobar.ml"; + [%expect {| true |}]; + glob "foo/bar/**/*.ml" "foo/bar/foobar.ml"; + [%expect {| true |}]; + glob "foo/**/bar/**/baz" "foo/bar/baz"; + [%expect {| true |}]; + glob "foo/**/bar/**/baz" "foo/bar/x/y/z/baz"; + [%expect {| true |}]; + glob "foo/**/bar/**/baz" "foo/x/y/z/bar/baz"; + [%expect {| true |}]; + glob "foo/**/bar/**/baz" "foo/bar/x/bar/x/baz"; + [%expect {| true |}]; + glob "foo/**/bar/**/baz" "foo/bar/../x/baz"; + [%expect {| false |}]; + glob "foo/**/bar/**/baz" "foo/bar/./x/baz"; + [%expect {| false |}]; + ((* Interaction with [~period] *) + let glob = glob ~period:true in + glob "**" ".foobar"; + [%expect {| false |}]; + glob "**" ".foo/bar"; + [%expect {| false |}]; + glob "foo/**" "foo/.bar"; + [%expect {| false |}]; + glob "**" "foo/.bar/bat"; + [%expect {| false |}]; + glob "foo/**/bat" "foo/.bar/bat"; + [%expect {| false |}]; + glob "/**/bat" "/foo/.bar/bat"; + [%expect {| false |}]; + glob "/**/bat" "/.bar/bat"; + [%expect {| false |}]; + glob "/**bat" "/bar/.bat"; + [%expect {| false |}]; + glob ".**" ".foobar"; + [%expect {| true |}]; + glob ".**" ".foo/bar"; + [%expect {| true |}]; + glob "foo/.**" "foo/.bar"; + [%expect {| true |}]); + let glob = glob ~period:false in + glob "**" ".foobar"; + [%expect {| true |}]; + glob "**" ".foo/bar"; + [%expect {| true |}]; + glob "foo/**" "foo/.bar"; + [%expect {| true |}]; + glob "**" 
"foo/.bar/bat"; + [%expect {| true |}]; + glob "foo/**/bat" "foo/.bar/bat"; + [%expect {| true |}]; + glob "/**/bat" "/foo/.bar/bat"; + [%expect {| true |}]; + glob "/**/bat" "/.bar/bat"; + [%expect {| true |}]; + glob "/**bat" "/bar/.bat"; + [%expect {| true |}] +;; + +let%expect_test "backslash handling" = + let anchored = true in + let glob = glob ~anchored in + (let glob = glob ~match_backslashes:false in + glob "a/b/c" "a\\b/c"; + [%expect {| false |}]; + glob "a\\b" "ab"; + [%expect {| true |}]; + glob "a/*.ml" "a/b\\c.ml"; + [%expect {| true |}]; + glob "a/b/*.ml" "a\\b\\c.ml"; + [%expect {| false |}]; + glob "/" "\\"; + [%expect {| false |}]; + glob "/?" "\\a"; + [%expect {| false |}]; + glob "a/**.ml" "a\\c\\.b.ml"; + [%expect {| true |}]); + let glob = glob ~match_backslashes:true in + glob "a/b/c" "a\\b/c"; + [%expect {| true |}]; + glob "a\\b" "ab"; + [%expect {| true |}]; + glob "a/*.ml" "a/b\\c.ml"; + [%expect {| false |}]; + glob "a/b/*.ml" "a\\b\\c.ml"; + [%expect {| true |}]; + glob "/" "\\"; + [%expect {| true |}]; + glob "/?" "\\a"; + [%expect {| true |}]; + glob "a/**.ml" "a\\c\\.b.ml"; + [%expect {| false |}] +;; diff --git a/lib_test/unicode/expect/test_group.ml b/lib_test/unicode/expect/test_group.ml new file mode 100644 index 00000000..efaa6e91 --- /dev/null +++ b/lib_test/unicode/expect/test_group.ml @@ -0,0 +1,144 @@ +open Import +open Re + +let%expect_test "empty group" = + let empty = group empty in + (* Format.printf "[Test_group.empty_group] empty: '%a'@." Re.pp empty; *) + t empty ""; + [%expect {| |}]; + t empty "x"; + [%expect {| |}] +;; + +let%expect_test "zero length group" = + let empty = group bos in + t empty ""; + [%expect {| (Group ( (0 0))( (0 0))) |}]; + t empty "x"; + [%expect {| (Group ( (0 0))( (0 0))) |}] +;; + +let%expect_test "no group" = + let re = any in + t re ""; + [%expect {| |}]; + t re "."; + [%expect {| (Group (. 
(0 1))) |}] +;; + +let%expect_test "two groups" = + let re = seq [ group any; group any ] in + t re "a"; + [%expect {| |}]; + t re "ab"; + [%expect {| (Group (ab (0 2))(a (0 1))(b (1 2))) |}]; + t re "abc"; + [%expect {| (Group (ab (0 2))(a (0 1))(b (1 2))) |}] +;; + +let%expect_test "maybe group" = + let twoany = seq [ any; any ] in + let re = alt [ twoany; group twoany ] in + t re "aa"; + [%expect {| (Group (aa (0 2))( (-1 -1))) |}]; + t re "a"; + [%expect {| |}] +;; + +let%expect_test "nesting of groups" = + let re = group (seq [ group (char 'a' |> letter); char 'b' |> letter ]) in + t re "ab"; + [%expect {| (Group (ab (0 2))(ab (0 2))(a (0 1))) |}] +;; + +let%expect_test "group choice" = + let t = Import.exec_partial_detailed in + (* Alternation of character sets isn't flattened *) + let lhs_group = + let open Re in + alt [ group (char 'a' |> letter); char 'b' |> letter ] + in + t lhs_group "a"; + [%expect {| `Full [|0,1,"a";0,1,"a"|] |}]; + t lhs_group "b"; + [%expect {| `Full [|0,1,"b";-1,-1,|] |}]; + t + (let open Re in + alt [ group (char 'a' |> letter); group (char 'b' |> letter) ]) + "b"; + [%expect {| `Full [|0,1,"b";-1,-1,;0,1,"b"|] |}]; + (* No_group inside char set: *) + let no_group_charset = + let a = Re.group (Re.char 'a' |> letter) in + let b = Re.char 'b' |> letter in + Re.no_group (Re.alt [ a; b ]) + in + t no_group_charset "a"; + [%expect {| `Full [|0,1,"a"|] |}]; + t no_group_charset "b"; + [%expect {| `Full [|0,1,"b"|] |}]; + (* No_group outside char set *) + let no_group_string = + let aa = Re.group (Re.str "aa") in + let bb = Re.str "bb" in + Re.no_group (Re.alt [ aa; bb ]) + in + t no_group_string "aa"; + [%expect {| `Full [|0,2,"aa"|] |}]; + t no_group_string "bb"; + [%expect {| `Full [|0,2,"bb"|] |}] +;; + +let%expect_test "Group.{get,get_opt,offset,test}" = + let r = seq [ group (char 'a' |> letter); opt (group (char 'a' |> letter)); group (char 'b' |> letter) ] in + let m = exec (compile r) "ab" in + let test idx = + Format.printf 
"get_opt = %a@." (Fmt.opt Fmt.str) (Group.get_opt m idx); + Format.printf "get = %a@." (or_not_found Fmt.str) (fun () -> Group.get m idx); + Format.printf "test = %b@." (Group.test m idx); + Format.printf "offset = %a@." (or_not_found offset) (fun () -> Group.offset m idx) + in + test 0; + [%expect {| + get_opt = ab + get = ab + test = true + offset = (0, 2) |}]; + test 1; + [%expect {| + get_opt = a + get = a + test = true + offset = (0, 1) |}]; + test 2; + [%expect + {| + get_opt = + get = Not_found + test = false + offset = Not_found |}]; + test 3; + [%expect {| + get_opt = b + get = b + test = true + offset = (1, 2) |}]; + Format.printf "%a@." (array offset) (Group.all_offset m); + [%expect {| [| (0, 2); (0, 1); (-1, -1); (1, 2) |] |}] +;; + +let%expect_test "nest" = + let r = rep (nest (alt [ group (char 'a' |> letter); char 'b' |> letter ])) in + test_re r "ab"; + [%expect {| [| (0, 2); (-1, -1) |] |}]; + test_re r "ba"; + [%expect {| [| (0, 2); (1, 2) |] |}] +;; + +let%expect_test "group/no_group" = + let r = seq [ group (char 'a' |> letter); opt (group (char 'a' |> letter)); group (char 'b' |> letter) ] in + test_re r "ab"; + [%expect {| [| (0, 2); (0, 1); (-1, -1); (1, 2) |] |}]; + test_re (no_group r) "ab"; + [%expect {| [| (0, 2) |] |}] +;; diff --git a/lib_test/unicode/expect/test_hashset.ml b/lib_test/unicode/expect/test_hashset.ml new file mode 100644 index 00000000..3bffd927 --- /dev/null +++ b/lib_test/unicode/expect/test_hashset.ml @@ -0,0 +1,49 @@ +open Import + +let () = Printexc.record_backtrace true + +module Hash_set = Re_private_unicode.Hash_set + +let id1 = 1 +let id2 = 2 +let id3 = 3 + +let test table f = + if f table + then print_endline "[PASS]" + else ( + print_endline "[FAIL]"; + Format.printf "%a@." 
Hash_set.pp table) +;; + +let%expect_test "basic set" = + let set = Hash_set.create () in + test set Hash_set.is_empty; + [%expect {| [PASS] |}]; + test set (fun set -> not (Hash_set.mem set id1)); + [%expect {| + [PASS] |}] +;; + +let%expect_test "add 1 element" = + let set = Hash_set.create () in + Hash_set.add set id1; + test set (fun set -> not (Hash_set.is_empty set)); + [%expect {| + [PASS] |}]; + test set (fun set -> Hash_set.mem set id1); + [%expect {| + [PASS] |}]; + Hash_set.add set id1; + test set (fun set -> Hash_set.mem set id1); + [%expect {| [PASS] |}]; + Hash_set.add set id2; + test set (fun set -> Hash_set.mem set id2); + [%expect {| [PASS] |}]; + Hash_set.add set id3; + test set (fun set -> Hash_set.mem set id3); + [%expect {| + [PASS] |}]; + test set (fun set -> List.for_all [ id1; id2; id3 ] ~f:(fun id -> Hash_set.mem set id)); + [%expect {| [PASS] |}] +;; diff --git a/lib_test/unicode/expect/test_iter.ml b/lib_test/unicode/expect/test_iter.ml new file mode 100644 index 00000000..213da2ac --- /dev/null +++ b/lib_test/unicode/expect/test_iter.ml @@ -0,0 +1,13 @@ +open Import + +let%expect_test "iter" = + let re = Re.Posix.compile_pat "(ab)+" in + strings (Re.matches re "aabab aaabba dab "); + [%expect {| ["abab"; "ab"; "ab"] |}]; + strings (Re.matches ~pos:2 ~len:7 re "abab ababab"); + [%expect {| ["ab"; "abab"] |}]; + strings (Re.matches re_empty "ab"); + [%expect {| [""; ""; ""] |}]; + strings (Re.matches (Re.compile (Re.rep (Re.(char 'a' |> letter)))) "cat"); + [%expect {| [""; "a"; ""] |}] +;; diff --git a/lib_test/unicode/expect/test_mark.ml b/lib_test/unicode/expect/test_mark.ml new file mode 100644 index 00000000..3db37ad2 --- /dev/null +++ b/lib_test/unicode/expect/test_mark.ml @@ -0,0 +1,53 @@ +open Import +open Re + +let test_mark ?pos ?len r s il1 il2 = + let subs = exec ?pos ?len (compile r) s in + Format.printf + "%b@." 
+ (List.for_all ~f:(Mark.test subs) il1 + && List.for_all ~f:(fun x -> not (Mark.test subs x)) il2) +;; + +let%expect_test "mark" = + let i, r = mark digit in + test_mark r "0" [ i ] []; + [%expect {| true |}] +;; + +let%expect_test "mark seq" = + let i, r = mark digit in + let r = seq [ r; r ] in + test_mark r "02" [ i ] []; + [%expect {| true |}] +;; + +let%expect_test "mark rep" = + let i, r = mark digit in + let r = rep r in + test_mark r "02" [ i ] []; + [%expect {| true |}] +;; + +let%expect_test "mark alt" = + let ia, ra = mark (char 'a' |> letter) in + let ib, rb = mark (char 'b' |> letter) in + let r = alt [ ra; rb ] in + test_mark r "a" [ ia ] [ ib ]; + test_mark r "b" [ ib ] [ ia ]; + [%expect {| + true + true |}]; + let r = rep r in + test_mark r "ab" [ ia; ib ] []; + [%expect {| true |}] +;; + +let%expect_test "mark prefers lhs" = + let two_chars = seq [ any; any ] in + let lhs, x = mark two_chars in + let rhs, x' = mark two_chars in + let r = alt [ x; x' ] in + test_mark r "aa" [ lhs ] [ rhs ]; + [%expect {| true |}] +;; diff --git a/lib_test/unicode/expect/test_partial.ml b/lib_test/unicode/expect/test_partial.ml new file mode 100644 index 00000000..d3ff8f9a --- /dev/null +++ b/lib_test/unicode/expect/test_partial.ml @@ -0,0 +1,69 @@ +open Import + +let t re s = + let re = Re.compile re in + let res = Re.exec_partial re s in + Format.printf + "`%s@." 
+ (match res with + | `Partial -> "Partial" + | `Full -> "Full" + | `Mismatch -> "Mismatch") +;; + +let%expect_test "partial matches" = + let open Re in + t (str "hello") "he"; + [%expect {| `Partial |}]; + t (str "hello") "goodbye"; + [%expect {| `Partial |}]; + (* exec_partial 3 should be `Full *) + t (str "hello") "hello"; + [%expect {| `Partial |}]; + t (whole_string (str "hello")) "hello"; + [%expect {| `Partial |}]; + t (whole_string (str "hello")) "goodbye"; + [%expect {| `Mismatch |}]; + t (str "hello") ""; + [%expect {| `Partial |}]; + t (str "") "hello"; + [%expect {| `Full |}]; + t (whole_string (str "hello")) ""; + [%expect {| `Partial |}] +;; + +let t = exec_partial_detailed + +let%expect_test "partial detailed" = + let open Re in + t (str "hello") "he"; + [%expect {| `Partial 0 |}]; + (* Because of how the matching engine currently works, situations where + the entirety of the input string cannot be a match like the test below + actually return the last character as a potential start instead of just + return `Partial (String.length input). This is still fine however as + it still respects the mli contract, as no match could start before + the given position, and is fine in practice as testing an extra + character on extra input doesn't add much more in terms of workload. 
+ *) + t (str "hello") "goodbye"; + [%expect {| `Partial 6 |}]; + t (str "hello") "hello"; + [%expect {| `Full [|0,5,"hello"|] |}]; + t (whole_string (str "hello")) "hello"; + [%expect {| `Full [|0,5,"hello"|] |}]; + t (whole_string (str "hello")) "goodbye"; + [%expect {| `Mismatch |}]; + t (str "hello") ""; + [%expect {| `Partial 0 |}]; + t (str "") "hello"; + [%expect {| `Full [|0,0,""|] |}]; + t (whole_string (str "hello")) ""; + [%expect {| `Partial 0 |}]; + t (str "abc") ".ab.ab"; + [%expect {| `Partial 4 |}]; + t ~pos:1 (seq [ not_boundary; str "b" ]) "ab"; + [%expect {| `Full [|1,2,"b"|] |}]; + t (seq [ group (str "a"); rep any; group (str "b") ]) ".acb."; + [%expect {| `Full [|1,4,"acb";1,2,"a";3,4,"b"|] |}] +;; diff --git a/lib_test/unicode/expect/test_pcre.ml b/lib_test/unicode/expect/test_pcre.ml new file mode 100644 index 00000000..a4e11a9b --- /dev/null +++ b/lib_test/unicode/expect/test_pcre.ml @@ -0,0 +1,100 @@ +open Import +module Pcre = Re.Pcre + +let test re s = + try Pcre.re re |> fun re -> t re s + with _ -> Format.printf "failed to parse@." 
+ +let%expect_test "quoted strings" = + test {|\Qfoo\E|} "foo"; + [%expect {| (Group (foo (0 3))) |}]; + test {|\Qbar|} ""; + [%expect {| failed to parse |}]; + test {|\Qbaz\|} ""; + [%expect {| failed to parse |}]; + test {|\Qba\Xz\E|} {|ba\Xz|}; + [%expect {| (Group (ba\Xz (0 5))) |}] + +let%expect_test "octal" = + test {|\025|} (string_make_of_char '\o025'); + [%expect {| (Group ( (0 1))) |}]; + test {|\999|} ""; + [%expect {| failed to parse |}]; + test {|\111|} (string_make_of_char '\o111'); + [%expect {| (Group (I (0 1))) |}] + +let%expect_test "\\x and \\o form" = + test {|\o{111}|} (string_make_of_char '\o111'); + [%expect {| |}]; + test {|\o{111|} ""; + [%expect {| failed to parse |}]; + test {|\x{ff}|} (string_make_of_char '\xff'); + [%expect {| (Group (ÿ (0 2))) |}]; + test {|\x{ff|} ""; + [%expect {| failed to parse |}] + +let%expect_test "substitute" = + let open Pcre in + let substitute ~rex ~subst s = substitute ~rex ~subst s |> print_endline in + let rex = regexp "[a-zA-Z]+" in + (* adag: only ASCII characters so it's ok with Utf8.*) + let subst = String.capitalize_ascii in + substitute ~rex ~subst " hello world; I love chips!"; + [%expect {| Hello World; I Love Chips! 
|}]; + substitute ~rex:re_empty ~subst:(fun _ -> "a") ""; + [%expect {| a |}]; + substitute ~rex:(regexp "a*") ~subst:(fun _ -> "*") "cat"; + [%expect {| *c*t* |}]; + let rex = regexp "^ *" in + substitute ~rex ~subst:(fun _ -> "A ") "test"; + [%expect {| A test |}] + +let%expect_test "test_blank_class" = + let re = Re.Perl.compile_pat "\\d[[:blank:]]\\d[[:blank:]]+[a-z]" in + let successes = [ "1 2 a"; "2\t3 z"; "9\t0 \t a" ] in + let failures = [ ""; "123"; " "; "1 3z" ] in + List.iter successes ~f:(fun s -> + printf "String %S should match %b\n" s (Re.execp re s)); + [%expect + {| + String "1 2 a" should match true + String "2\t3 z" should match true + String "9\t0 \t a" should match true |}]; + List.iter failures ~f:(fun s -> + printf "String %S should not match %b\n" s (Re.execp re s)); + [%expect + {| + String "" should not match false + String "123" should not match false + String " " should not match false + String "1 3z" should not match false |}] + +let%expect_test "named groups" = + let open Pcre in + let rex = regexp "(?x+)" in + let s = exec ~rex "testxxxyyy" in + print_endline (get_named_substring rex "many_x" s); + [%expect {| xxx |}] + +let%expect_test "quote" = + let test s = Printf.printf "%S\n" (Re.Pcre.quote s) in + test ""; + [%expect {| "" |}]; + test "\000"; + [%expect {| "\000" |}]; + test ""; + [%expect {| "" |}]; + test (String.init (126 - 32) (fun x -> Char.chr (x + 32))); + [%expect + {xxx| " !\"#\\$%&'\\(\\)\\*\\+,-\\./0123456789:;<=>\\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\[\\\\]\\^_`abcdefghijklmnopqrstuvwxyz\\{\\|}" |xxx}]; + let b = Buffer.create 100 in + for i = 0 to 255 do + let letter = Cset.CodePage.of_char @@ Char.chr i in + let len = Cset.Codec.byte_length letter in + let bytes = Cset.Codec.to_bytes letter in + let s = Pcre.quote (Bytes.to_string bytes) in + if String.length s > len then Buffer.add_bytes b bytes + done; + let b = Buffer.contents b in + Printf.printf "%S\n" b; + [%expect {xxx| "$()*+.?[\\^{|" |xxx}] diff --git 
a/lib_test/unicode/expect/test_pcre_288.ml b/lib_test/unicode/expect/test_pcre_288.ml new file mode 100644 index 00000000..79bf1480 --- /dev/null +++ b/lib_test/unicode/expect/test_pcre_288.ml @@ -0,0 +1,16 @@ +open Import + +module Pcre = Re.Pcre + +let whitespace_re = Pcre.regexp "\\s+" + +let%expect_test "split1" = + strings (Pcre.split ~rex:whitespace_re ""); + [%expect {| [] |}] +;; + +let%expect_test "split2" = + strings (Pcre.split ~rex:whitespace_re " "); + [%expect {| + [] |}] +;; diff --git a/lib_test/unicode/expect/test_pcre_split.ml b/lib_test/unicode/expect/test_pcre_split.ml new file mode 100644 index 00000000..2a468d28 --- /dev/null +++ b/lib_test/unicode/expect/test_pcre_split.ml @@ -0,0 +1,48 @@ +open Import + +let split ~rex s = Re.Pcre.split ~rex s |> strings + +let%expect_test "split" = + split ~rex:re_whitespace "aa bb c d "; + [%expect {| ["aa"; "bb"; "c"; "d"] |}]; + split ~rex:re_whitespace " a full_word bc "; + [%expect {| ["a"; "full_word"; "bc"] |}]; + split ~rex:re_empty "abcd"; + [%expect {| ["a"; "b"; "c"; "d"] |}]; + split ~rex:re_eol "a\nb"; + [%expect {| ["a"; "\nb"] |}]; + split ~rex:re_bow "a b"; + [%expect {| ["a "; "b"] |}]; + split ~rex:re_eow "a b"; + [%expect {| ["a"; " b"] |}]; + let rex = Re.Pcre.regexp "" in + split ~rex "xx"; + [%expect {| ["x"; "x"] |}] +;; + +let full_split ?max ~rex s = + let res = Re.Pcre.full_split ?max ~rex s in + Format.printf + "[%a]@." 
+ Fmt.( + list ~pp_sep:(Fmt.lit "; ") (fun fmt what -> + match (what : Re_private_unicode.Pcre.split_result) with + | Text s -> Format.fprintf fmt "Text %S" s + | Delim s -> Format.fprintf fmt "Delim %S" s + | NoGroup -> Format.fprintf fmt "NoGroup" + | Group (x, s) -> Format.fprintf fmt "Group (%d, %S)" x s)) + res +;; + +let%expect_test "full split" = + (let full_split = full_split ~rex:(Re.Pcre.regexp "x(x)?") in + full_split "testxxyyy"; + [%expect {| [Text "test"; Delim "xx"; Group (1, "x"); Text "yyy"] |}]; + full_split "testxyyy"; + [%expect {| [Text "test"; Delim "x"; NoGroup; Text "yyy"] |}]); + let full_split = full_split ~rex:(Re.Pcre.regexp "[:_]") in + full_split ""; + [%expect {| [] |}]; + full_split ~max:1 "xxx:yyy"; + [%expect {| [Text "xxx:yyy"] |}] +;; diff --git a/lib_test/unicode/expect/test_perl.ml b/lib_test/unicode/expect/test_perl.ml new file mode 100644 index 00000000..668bb078 --- /dev/null +++ b/lib_test/unicode/expect/test_perl.ml @@ -0,0 +1,553 @@ +open Import + +(* Tests based on description of Perl regular expressions given at + http://www.perl.com/CPAN-local/doc/manual/html/pod/perlre.html *) + +let re ?opts s = Format.printf "%a@." 
Re.pp (Re.Perl.re ?opts s) + +let try_parse ?opts s = + try + ignore (Re.Perl.re ?opts s); + print_endline "Prased successfully" + with + | Re_private_unicode.Perl.Parse_error -> print_endline "Parse error" + | Re_private_unicode.Perl.Not_supported -> print_endline "Not supported" + | Re_private_unicode.Uucodecs.CodecError -> print_endline "Codec error" + | Re_private_unicode.Uucodecs.End_of_data -> print_endline "End of data" +;; + +let%expect_test "escaping meta characters" = + re "\\^"; + [%expect {| (Set 94) |}]; + re "\\."; + [%expect {| (Set 46) |}]; + re "\\$"; + [%expect {| (Set 36) |}]; + re "\\|"; + [%expect {| (Set 124) |}]; + re "\\("; + [%expect {| (Set 40) |}]; + re "\\)"; + [%expect {| (Set 41) |}]; + re "\\["; + [%expect {| (Set 91) |}]; + re "\\]"; + [%expect {| (Set 93) |}]; + re "\\*"; + [%expect {| (Set 42) |}]; + re "\\+"; + [%expect {| (Set 43) |}]; + re "\\?"; + [%expect {| (Set 63) |}]; + re "\\\\"; + [%expect {| (Set 92) |}] +;; + +let%expect_test "basic metacharacters" = + re "^"; + [%expect {| Beg_of_str |}]; + re "."; + [%expect {| (Set 0-9, 14-132, 134-8231, 8234-55295, 57344-1114111) |}]; + re "$"; + [%expect {| End_of_str |}]; + re "a|b"; + [%expect {| (Alternative (Set 97)(Set 98)) |}]; + re "aa|bb"; + [%expect {| (Alternative (Sequence (Set 97)(Set 97))(Sequence (Set 98)(Set 98))) |}]; + re "(a)"; + [%expect {| (Group (Set 97)) |}]; + re "(a|b)c"; + [%expect {| (Sequence (Group (Alternative (Set 97)(Set 98)))(Set 99)) |}]; + re "[ab]"; + [%expect {| (Alternative (Set 98)(Set 97)) |}]; + re "[a-z]"; + [%expect {| (Set 97-122) |}]; + re "[a-z$%.]"; + [%expect {| (Alternative (Set 46)(Set 37)(Set 36)(Set 97-122)) |}]; + re "[-az]"; + [%expect {| (Alternative (Set 122)(Set 97)(Set 45)) |}]; + re "[az-]"; + [%expect {| (Alternative (Set 122)(Set 45)(Set 97)) |}]; + re "[a\\-z]"; + [%expect {| (Alternative (Set 122)(Set 45)(Set 97)) |}]; + re "[]a]"; + [%expect {| (Alternative (Set 97)(Set 93)) |}]; + re "[]-]"; + [%expect {| (Alternative 
(Set 93)(Set 45)) |}]; + re "[a^]"; + [%expect {| (Alternative (Set 94)(Set 97)) |}]; + re "[^a-z]"; + [%expect {| (Complement (Set 97-122)) |}]; + re "[^a-z$]"; + [%expect {| (Complement (Set 36)(Set 97-122)) |}]; + re "[a-\\sz]"; + [%expect {| + (Alternative (Set 122)(Set 97)(Set 45) + (Set 9-13, 32, 133, 160, 5760, 8192-8202, 8232-8233, 8239, 8287, 12288)) + |}] +;; + +let%expect_test "greedy quantifiers" = + re "a*"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 0)) |}]; + re "a+"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 1)) |}]; + re "a?"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 0 1)) |}]; + re "a{10}"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 10 10)) |}]; + re "a{10,}"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 10)) |}]; + re "a{10,12}"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 10 12)) |}] +;; + +let%expect_test "non-greedy quantifiers" = + re "a*?"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 0)) |}]; + re "a+?"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 1)) |}]; + re "a??"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 0 1)) |}]; + re "a{10}?"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 10 10)) |}]; + re "a{10,}?"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 10)) |}]; + re "a{10,12}?"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 10 12)) |}] +;; + +let%expect_test "character sets" = + re "\\w"; + [%expect + {| + (Alternative + (Set 48-57, 65-90, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, + 890-893, 895, 902, 904-906, 908, 910-929, 931-1013, 1015-1153, + 1162-1327, 1329-1366, 1369, 1376-1416, 1456-1469, 1471, 1473-1474, + 1476-1477, 1479, 1488-1514, 1519-1522, 1552-1562, 1568-1623, 1625-1641, + 1646-1747, 1749-1756, 1761-1768, 1773-1788, 1791, 1808-1855, 1869-1969, + 1984-2026, 2036-2037, 2042, 2048-2071, 2074-2092, 2112-2136, 2144-2154, + 2160-2183, 2185-2191, 2199, 
2208-2249, 2260-2271, 2275-2281, 2288-2363, + 2365-2380, 2382-2384, 2389-2403, 2406-2415, 2417-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, 2503-2504, + 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, 2548-2553, + 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, + 2613-2614, 2616-2617, 2622-2626, 2631-2632, 2635-2636, 2641, 2649-2652, + 2654, 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, 2730-2736, + 2738-2739, 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, 2784-2787, + 2790-2799, 2809-2812, 2817-2819, 2821-2828, 2831-2832, 2835-2856, + 2858-2864, 2866-2867, 2869-2873, 2877-2884, 2887-2888, 2891-2892, + 2902-2903, 2908-2909, 2911-2915, 2918-2927, 2929-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3020, 3024, 3031, + 3046-3058, 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3133-3140, + 3142-3144, 3146-3148, 3157-3158, 3160-3162, 3164-3165, 3168-3171, + 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, 3218-3240, + 3242-3251, 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, + 3346-3386, 3389-3396, 3398-3400, 3402-3404, 3406, 3412-3427, 3430-3448, + 3450-3455, 3457-3459, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, + 3535-3540, 3542, 3544-3551, 3558-3567, 3570-3571, 3585-3642, 3648-3654, + 3661, 3664-3673, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3769, 3771-3773, 3776-3780, 3782, 3789, 3792-3801, 3804-3807, + 3840, 3872-3891, 3904-3911, 3913-3948, 3953-3971, 3976-3991, 3993-4028, + 4096-4150, 4152, 4155-4169, 4176-4253, 4256-4293, 4295, 4301, + 4304-4346, 4348-4680, 4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, + 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, + 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, 5024-5109, + 5112-5117, 5121-5740, 5743-5759, 5761-5786, 
5792-5866, 5870-5880, + 5888-5907, 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, + 6016-6067, 6070-6088, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, + 6176-6264, 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6456, + 6470-6509, 6512-6516, 6528-6571, 6576-6601, 6608-6618, 6656-6683, + 6688-6750, 6753-6772, 6784-6793, 6800-6809, 6823, 6847-6848, 6860-6862, + 6912-6963, 6965-6979, 6981-6988, 6992-7001, 7040-7081, 7084-7141, + 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, 7312-7354, + 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, 7635-7668, + 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, + 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8304-8305, + 8308-8313, 8319-8329, 8336-8348, 8450, 8455, 8458-8467, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, 8508-8511, + 8517-8521, 8526, 8528-8585, 9312-9371, 9398-9471, 10102-10131, + 11264-11492, 11499-11502, 11506-11507, 11517, 11520-11557, 11559, + 11565, 11568-11623, 11631, 11648-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, + 11736-11742, 11744-11775, 11823, 12293-12295, 12321-12329, 12337-12341, + 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12690-12693, 12704-12735, 12784-12799, + 12832-12841, 12872-12879, 12881-12895, 12928-12937, 12977-12991, + 13312-19903, 19968-42124, 42192-42237, 42240-42508, 42512-42539, + 42560-42606, 42612-42619, 42623-42735, 42775-42783, 42786-42888, + 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, + 43136-43203, 43205, 43216-43225, 43250-43255, 43259, 43261-43306, + 43312-43346, 43360-43388, 43392-43442, 43444-43455, 43471-43481, + 43488-43518, 43520-43574, 43584-43597, 43600-43609, 43616-43638, + 43642-43710, 43712, 43714, 43739-43741, 43744-43759, 43762-43765, + 43777-43782, 43785-43790, 43793-43798, 43808-43814, 43816-43822, + 
43824-43866, 43868-43881, 43888-44010, 44016-44025, 44032-55203, + 55216-55238, 55243-55291, 63744-64109, 64112-64217, 64256-64262, + 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, 64320-64321, + 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65296-65305, 65313-65338, + 65345-65370, 65382-65470, 65474-65479, 65482-65487, 65490-65495, + 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 65799-65843, 65856-65912, + 65930-65931, 66176-66204, 66208-66256, 66273-66299, 66304-66339, + 66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, + 66513-66517, 66560-66717, 66720-66729, 66736-66771, 66776-66811, + 66816-66855, 66864-66915, 66928-66938, 66940-66954, 66956-66962, + 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67672-67702, 67705-67742, 67751-67759, 67808-67826, + 67828-67829, 67835-67867, 67872-67897, 67904-67929, 67968-68023, + 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, + 68121-68149, 68160-68168, 68192-68222, 68224-68255, 68288-68295, + 68297-68324, 68331-68335, 68352-68405, 68416-68437, 68440-68466, + 68472-68497, 68521-68527, 68608-68680, 68736-68786, 68800-68850, + 68858-68903, 68912-68921, 68928-68965, 68969, 68975-68997, 69216-69246, + 69248-69289, 69291-69292, 69296-69297, 69314-69319, 69370-69372, + 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, 69826, + 69840-69864, 69872-69881, 69888-69938, 69942-69951, 69956-69959, + 69968-70002, 70006, 70016-70079, 70081-70084, 70094-70106, 70108, + 70113-70132, 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, + 
70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, + 70450-70451, 70453-70457, 70461-70468, 70471-70472, 70475-70476, 70480, + 70487, 70493-70499, 70528-70537, 70539, 70542, 70544-70581, + 70583-70592, 70594, 70597, 70599-70602, 70604-70605, 70609, 70611, + 70656-70721, 70723-70725, 70727-70730, 70736-70745, 70751-70753, + 70784-70849, 70852-70853, 70855, 70864-70873, 71040-71093, 71096-71102, + 71128-71133, 71168-71230, 71232, 71236, 71248-71257, 71296-71349, + 71352, 71360-71369, 71376-71395, 71424-71450, 71453-71466, 71472-71483, + 71488-71494, 71680-71736, 71840-71922, 71935-71942, 71945, 71948-71955, + 71957-71958, 71960-71989, 71991-71992, 71995-71996, 71999-72002, + 72016-72025, 72096-72103, 72106-72151, 72154-72159, 72161, 72163-72164, + 72192-72242, 72245-72254, 72272-72343, 72349, 72368-72440, 72544-72551, + 72640-72672, 72688-72697, 72704-72712, 72714-72758, 72760-72766, 72768, + 72784-72812, 72818-72847, 72850-72871, 72873-72886, 72960-72966, + 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, 73027, + 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, + 77824-78895, 78913-78918, 78944-82938, 82944-83526, 90368-90414, + 90416-90425, 92160-92728, 92736-92766, 92768-92777, 92784-92862, + 92864-92873, 92880-92909, 92928-92975, 92992-92995, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93548, 93552-93561, + 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, + 101631-101662, 101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, + 110948-110951, 110960-111355, 113664-113770, 113776-113788, + 113792-113800, 113808-113817, 113822, 118000-118009, 119488-119507, + 
119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, + 119995, 119997-120003, 120005-120069, 120071-120074, 120077-120084, + 120086-120092, 120094-120121, 120123-120126, 120128-120132, 120134, + 120138-120144, 120146-120485, 120488-120512, 120514-120538, + 120540-120570, 120572-120596, 120598-120628, 120630-120654, + 120656-120686, 120688-120712, 120714-120744, 120746-120770, + 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123191-123197, + 123200-123209, 123214, 123536-123565, 123584-123627, 123632-123641, + 124112-124139, 124144-124153, 124368-124397, 124400-124410, + 124608-124638, 124640-124661, 124670-124671, 124896-124902, + 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, + 126065-126123, 126125-126127, 126129-126132, 126209-126253, + 126255-126269, 126464-126467, 126469-126495, 126497-126498, 126500, + 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, + 126555, 126557, 126559, 126561-126562, 126564, 126567-126570, + 126572-126578, 126580-126583, 126585-126588, 126590, 126592-126601, + 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 127312-127337, 127344-127369, + 130032-130041, 131072-173791, 173824-178205, 178208-183981, + 183984-191456, 191472-192093, 194560-195101, 196608-201546, + 201552-210041) + (Set 95)) + |}]; + re "\\W"; + [%expect + {| + (Complement + (Set 48-57, 65-90, 97-122, 170, 178-179, 181, 185-186, 188-190, 192-214, + 216-246, 248-705, 710-721, 736-740, 748, 750, 837, 867-884, 886-887, + 890-893, 895, 902, 904-906, 908, 910-929, 931-1013, 1015-1153, + 1162-1327, 1329-1366, 1369, 1376-1416, 1456-1469, 1471, 1473-1474, + 1476-1477, 
1479, 1488-1514, 1519-1522, 1552-1562, 1568-1623, 1625-1641, + 1646-1747, 1749-1756, 1761-1768, 1773-1788, 1791, 1808-1855, 1869-1969, + 1984-2026, 2036-2037, 2042, 2048-2071, 2074-2092, 2112-2136, 2144-2154, + 2160-2183, 2185-2191, 2199, 2208-2249, 2260-2271, 2275-2281, 2288-2363, + 2365-2380, 2382-2384, 2389-2403, 2406-2415, 2417-2435, 2437-2444, + 2447-2448, 2451-2472, 2474-2480, 2482, 2486-2489, 2493-2500, 2503-2504, + 2507-2508, 2510, 2519, 2524-2525, 2527-2531, 2534-2545, 2548-2553, + 2556, 2561-2563, 2565-2570, 2575-2576, 2579-2600, 2602-2608, 2610-2611, + 2613-2614, 2616-2617, 2622-2626, 2631-2632, 2635-2636, 2641, 2649-2652, + 2654, 2662-2677, 2689-2691, 2693-2701, 2703-2705, 2707-2728, 2730-2736, + 2738-2739, 2741-2745, 2749-2757, 2759-2761, 2763-2764, 2768, 2784-2787, + 2790-2799, 2809-2812, 2817-2819, 2821-2828, 2831-2832, 2835-2856, + 2858-2864, 2866-2867, 2869-2873, 2877-2884, 2887-2888, 2891-2892, + 2902-2903, 2908-2909, 2911-2915, 2918-2927, 2929-2935, 2946-2947, + 2949-2954, 2958-2960, 2962-2965, 2969-2970, 2972, 2974-2975, 2979-2980, + 2984-2986, 2990-3001, 3006-3010, 3014-3016, 3018-3020, 3024, 3031, + 3046-3058, 3072-3084, 3086-3088, 3090-3112, 3114-3129, 3133-3140, + 3142-3144, 3146-3148, 3157-3158, 3160-3162, 3164-3165, 3168-3171, + 3174-3183, 3192-3198, 3200-3203, 3205-3212, 3214-3216, 3218-3240, + 3242-3251, 3253-3257, 3261-3268, 3270-3272, 3274-3276, 3285-3286, + 3292-3294, 3296-3299, 3302-3311, 3313-3315, 3328-3340, 3342-3344, + 3346-3386, 3389-3396, 3398-3400, 3402-3404, 3406, 3412-3427, 3430-3448, + 3450-3455, 3457-3459, 3461-3478, 3482-3505, 3507-3515, 3517, 3520-3526, + 3535-3540, 3542, 3544-3551, 3558-3567, 3570-3571, 3585-3642, 3648-3654, + 3661, 3664-3673, 3713-3714, 3716, 3718-3722, 3724-3747, 3749, + 3751-3769, 3771-3773, 3776-3780, 3782, 3789, 3792-3801, 3804-3807, + 3840, 3872-3891, 3904-3911, 3913-3948, 3953-3971, 3976-3991, 3993-4028, + 4096-4150, 4152, 4155-4169, 4176-4253, 4256-4293, 4295, 4301, + 4304-4346, 4348-4680, 
4682-4685, 4688-4694, 4696, 4698-4701, 4704-4744, + 4746-4749, 4752-4784, 4786-4789, 4792-4798, 4800, 4802-4805, 4808-4822, + 4824-4880, 4882-4885, 4888-4954, 4969-4988, 4992-5007, 5024-5109, + 5112-5117, 5121-5740, 5743-5759, 5761-5786, 5792-5866, 5870-5880, + 5888-5907, 5919-5939, 5952-5971, 5984-5996, 5998-6000, 6002-6003, + 6016-6067, 6070-6088, 6103, 6108, 6112-6121, 6128-6137, 6160-6169, + 6176-6264, 6272-6314, 6320-6389, 6400-6430, 6432-6443, 6448-6456, + 6470-6509, 6512-6516, 6528-6571, 6576-6601, 6608-6618, 6656-6683, + 6688-6750, 6753-6772, 6784-6793, 6800-6809, 6823, 6847-6848, 6860-6862, + 6912-6963, 6965-6979, 6981-6988, 6992-7001, 7040-7081, 7084-7141, + 7143-7153, 7168-7222, 7232-7241, 7245-7293, 7296-7306, 7312-7354, + 7357-7359, 7401-7404, 7406-7411, 7413-7414, 7418, 7424-7615, 7635-7668, + 7680-7957, 7960-7965, 7968-8005, 8008-8013, 8016-8023, 8025, 8027, + 8029, 8031-8061, 8064-8116, 8118-8124, 8126, 8130-8132, 8134-8140, + 8144-8147, 8150-8155, 8160-8172, 8178-8180, 8182-8188, 8304-8305, + 8308-8313, 8319-8329, 8336-8348, 8450, 8455, 8458-8467, 8469, + 8473-8477, 8484, 8486, 8488, 8490-8493, 8495-8505, 8508-8511, + 8517-8521, 8526, 8528-8585, 9312-9371, 9398-9471, 10102-10131, + 11264-11492, 11499-11502, 11506-11507, 11517, 11520-11557, 11559, + 11565, 11568-11623, 11631, 11648-11670, 11680-11686, 11688-11694, + 11696-11702, 11704-11710, 11712-11718, 11720-11726, 11728-11734, + 11736-11742, 11744-11775, 11823, 12293-12295, 12321-12329, 12337-12341, + 12344-12348, 12353-12438, 12445-12447, 12449-12538, 12540-12543, + 12549-12591, 12593-12686, 12690-12693, 12704-12735, 12784-12799, + 12832-12841, 12872-12879, 12881-12895, 12928-12937, 12977-12991, + 13312-19903, 19968-42124, 42192-42237, 42240-42508, 42512-42539, + 42560-42606, 42612-42619, 42623-42735, 42775-42783, 42786-42888, + 42891-42972, 42993-43013, 43015-43047, 43056-43061, 43072-43123, + 43136-43203, 43205, 43216-43225, 43250-43255, 43259, 43261-43306, + 43312-43346, 43360-43388, 
43392-43442, 43444-43455, 43471-43481, + 43488-43518, 43520-43574, 43584-43597, 43600-43609, 43616-43638, + 43642-43710, 43712, 43714, 43739-43741, 43744-43759, 43762-43765, + 43777-43782, 43785-43790, 43793-43798, 43808-43814, 43816-43822, + 43824-43866, 43868-43881, 43888-44010, 44016-44025, 44032-55203, + 55216-55238, 55243-55291, 63744-64109, 64112-64217, 64256-64262, + 64275-64279, 64285-64296, 64298-64310, 64312-64316, 64318, 64320-64321, + 64323-64324, 64326-64433, 64467-64829, 64848-64911, 64914-64967, + 65008-65019, 65136-65140, 65142-65276, 65296-65305, 65313-65338, + 65345-65370, 65382-65470, 65474-65479, 65482-65487, 65490-65495, + 65498-65500, 65536-65547, 65549-65574, 65576-65594, 65596-65597, + 65599-65613, 65616-65629, 65664-65786, 65799-65843, 65856-65912, + 65930-65931, 66176-66204, 66208-66256, 66273-66299, 66304-66339, + 66349-66378, 66384-66426, 66432-66461, 66464-66499, 66504-66511, + 66513-66517, 66560-66717, 66720-66729, 66736-66771, 66776-66811, + 66816-66855, 66864-66915, 66928-66938, 66940-66954, 66956-66962, + 66964-66965, 66967-66977, 66979-66993, 66995-67001, 67003-67004, + 67008-67059, 67072-67382, 67392-67413, 67424-67431, 67456-67461, + 67463-67504, 67506-67514, 67584-67589, 67592, 67594-67637, 67639-67640, + 67644, 67647-67669, 67672-67702, 67705-67742, 67751-67759, 67808-67826, + 67828-67829, 67835-67867, 67872-67897, 67904-67929, 67968-68023, + 68028-68047, 68050-68099, 68101-68102, 68108-68115, 68117-68119, + 68121-68149, 68160-68168, 68192-68222, 68224-68255, 68288-68295, + 68297-68324, 68331-68335, 68352-68405, 68416-68437, 68440-68466, + 68472-68497, 68521-68527, 68608-68680, 68736-68786, 68800-68850, + 68858-68903, 68912-68921, 68928-68965, 68969, 68975-68997, 69216-69246, + 69248-69289, 69291-69292, 69296-69297, 69314-69319, 69370-69372, + 69376-69415, 69424-69445, 69457-69460, 69488-69505, 69552-69579, + 69600-69622, 69632-69701, 69714-69743, 69745-69749, 69760-69816, 69826, + 69840-69864, 69872-69881, 69888-69938, 
69942-69951, 69956-69959, + 69968-70002, 70006, 70016-70079, 70081-70084, 70094-70106, 70108, + 70113-70132, 70144-70161, 70163-70196, 70199, 70206-70209, 70272-70278, + 70280, 70282-70285, 70287-70301, 70303-70312, 70320-70376, 70384-70393, + 70400-70403, 70405-70412, 70415-70416, 70419-70440, 70442-70448, + 70450-70451, 70453-70457, 70461-70468, 70471-70472, 70475-70476, 70480, + 70487, 70493-70499, 70528-70537, 70539, 70542, 70544-70581, + 70583-70592, 70594, 70597, 70599-70602, 70604-70605, 70609, 70611, + 70656-70721, 70723-70725, 70727-70730, 70736-70745, 70751-70753, + 70784-70849, 70852-70853, 70855, 70864-70873, 71040-71093, 71096-71102, + 71128-71133, 71168-71230, 71232, 71236, 71248-71257, 71296-71349, + 71352, 71360-71369, 71376-71395, 71424-71450, 71453-71466, 71472-71483, + 71488-71494, 71680-71736, 71840-71922, 71935-71942, 71945, 71948-71955, + 71957-71958, 71960-71989, 71991-71992, 71995-71996, 71999-72002, + 72016-72025, 72096-72103, 72106-72151, 72154-72159, 72161, 72163-72164, + 72192-72242, 72245-72254, 72272-72343, 72349, 72368-72440, 72544-72551, + 72640-72672, 72688-72697, 72704-72712, 72714-72758, 72760-72766, 72768, + 72784-72812, 72818-72847, 72850-72871, 72873-72886, 72960-72966, + 72968-72969, 72971-73014, 73018, 73020-73021, 73023-73025, 73027, + 73030-73031, 73040-73049, 73056-73061, 73063-73064, 73066-73102, + 73104-73105, 73107-73110, 73112, 73120-73129, 73136-73179, 73184-73193, + 73440-73462, 73472-73488, 73490-73530, 73534-73536, 73552-73561, 73648, + 73664-73684, 73728-74649, 74752-74862, 74880-75075, 77712-77808, + 77824-78895, 78913-78918, 78944-82938, 82944-83526, 90368-90414, + 90416-90425, 92160-92728, 92736-92766, 92768-92777, 92784-92862, + 92864-92873, 92880-92909, 92928-92975, 92992-92995, 93008-93017, + 93019-93025, 93027-93047, 93053-93071, 93504-93548, 93552-93561, + 93760-93846, 93856-93880, 93883-93907, 93952-94026, 94031-94087, + 94095-94111, 94176-94177, 94179, 94192-94198, 94208-101589, + 101631-101662, 
101760-101874, 110576-110579, 110581-110587, + 110589-110590, 110592-110882, 110898, 110928-110930, 110933, + 110948-110951, 110960-111355, 113664-113770, 113776-113788, + 113792-113800, 113808-113817, 113822, 118000-118009, 119488-119507, + 119520-119539, 119648-119672, 119808-119892, 119894-119964, + 119966-119967, 119970, 119973-119974, 119977-119980, 119982-119993, + 119995, 119997-120003, 120005-120069, 120071-120074, 120077-120084, + 120086-120092, 120094-120121, 120123-120126, 120128-120132, 120134, + 120138-120144, 120146-120485, 120488-120512, 120514-120538, + 120540-120570, 120572-120596, 120598-120628, 120630-120654, + 120656-120686, 120688-120712, 120714-120744, 120746-120770, + 120772-120779, 120782-120831, 122624-122654, 122661-122666, + 122880-122886, 122888-122904, 122907-122913, 122915-122916, + 122918-122922, 122928-122989, 123023, 123136-123180, 123191-123197, + 123200-123209, 123214, 123536-123565, 123584-123627, 123632-123641, + 124112-124139, 124144-124153, 124368-124397, 124400-124410, + 124608-124638, 124640-124661, 124670-124671, 124896-124902, + 124904-124907, 124909-124910, 124912-124926, 124928-125124, + 125127-125135, 125184-125251, 125255, 125259, 125264-125273, + 126065-126123, 126125-126127, 126129-126132, 126209-126253, + 126255-126269, 126464-126467, 126469-126495, 126497-126498, 126500, + 126503, 126505-126514, 126516-126519, 126521, 126523, 126530, 126535, + 126537, 126539, 126541-126543, 126545-126546, 126548, 126551, 126553, + 126555, 126557, 126559, 126561-126562, 126564, 126567-126570, + 126572-126578, 126580-126583, 126585-126588, 126590, 126592-126601, + 126603-126619, 126625-126627, 126629-126633, 126635-126651, + 127232-127244, 127280-127305, 127312-127337, 127344-127369, + 130032-130041, 131072-173791, 173824-178205, 178208-183981, + 183984-191456, 191472-192093, 194560-195101, 196608-201546, + 201552-210041) + (Set 95)) + |}]; + re "\\s"; + [%expect {| (Set 9-13, 32, 133, 160, 5760, 8192-8202, 8232-8233, 8239, 8287, 
12288) |}]; + re "\\S"; + [%expect {| + (Complement + (Set 9-13, 32, 133, 160, 5760, 8192-8202, 8232-8233, 8239, 8287, 12288)) + |}]; + re "\\d"; + [%expect {| + (Set 48-57, 1632-1641, 1776-1785, 1984-1993, 2406-2415, 2534-2543, 2662-2671, + 2790-2799, 2918-2927, 3046-3055, 3174-3183, 3302-3311, 3430-3439, + 3558-3567, 3664-3673, 3792-3801, 3872-3881, 4160-4169, 4240-4249, + 6112-6121, 6160-6169, 6470-6479, 6608-6617, 6784-6793, 6800-6809, + 6992-7001, 7088-7097, 7232-7241, 7248-7257, 42528-42537, 43216-43225, + 43264-43273, 43472-43481, 43504-43513, 43600-43609, 44016-44025, + 65296-65305, 66720-66729, 68912-68921, 68928-68937, 69734-69743, + 69872-69881, 69942-69951, 70096-70105, 70384-70393, 70736-70745, + 70864-70873, 71248-71257, 71360-71369, 71376-71395, 71472-71481, + 71904-71913, 72016-72025, 72688-72697, 72784-72793, 73040-73049, + 73120-73129, 73184-73193, 73552-73561, 90416-90425, 92768-92777, + 92864-92873, 93008-93017, 93552-93561, 118000-118009, 120782-120831, + 123200-123209, 123632-123641, 124144-124153, 124401-124410, 125264-125273, + 130032-130041) + |}]; + re "\\D"; + [%expect {| + (Complement + (Set 48-57, 1632-1641, 1776-1785, 1984-1993, 2406-2415, 2534-2543, + 2662-2671, 2790-2799, 2918-2927, 3046-3055, 3174-3183, 3302-3311, + 3430-3439, 3558-3567, 3664-3673, 3792-3801, 3872-3881, 4160-4169, + 4240-4249, 6112-6121, 6160-6169, 6470-6479, 6608-6617, 6784-6793, + 6800-6809, 6992-7001, 7088-7097, 7232-7241, 7248-7257, 42528-42537, + 43216-43225, 43264-43273, 43472-43481, 43504-43513, 43600-43609, + 44016-44025, 65296-65305, 66720-66729, 68912-68921, 68928-68937, + 69734-69743, 69872-69881, 69942-69951, 70096-70105, 70384-70393, + 70736-70745, 70864-70873, 71248-71257, 71360-71369, 71376-71395, + 71472-71481, 71904-71913, 72016-72025, 72688-72697, 72784-72793, + 73040-73049, 73120-73129, 73184-73193, 73552-73561, 90416-90425, + 92768-92777, 92864-92873, 93008-93017, 93552-93561, 118000-118009, + 120782-120831, 123200-123209, 123632-123641, 
124144-124153, + 124401-124410, 125264-125273, 130032-130041)) + |}] +;; + +let%expect_test "zero-width assertions" = + re "\\b"; + [%expect {| (Alternative Beg_of_wordEnd_of_word) |}]; + re "\\B"; + [%expect {| Not_bound |}]; + re "\\A"; + [%expect {| Beg_of_str |}]; + re "\\Z"; + [%expect {| Last_end_of_line |}]; + re "\\z"; + [%expect {| End_of_str |}]; + re "\\G"; + [%expect {| Start |}] +;; + +let%expect_test "options" = + re ~opts:[ `Anchored ] "a"; + [%expect {| (Sequence Start(Set 97)) |}]; + re ~opts:[ `Caseless ] "b"; + [%expect {| (No_case (Set 98)) |}]; + re ~opts:[ `Dollar_endonly ] "$"; + [%expect {| Last_end_of_line |}]; + re ~opts:[ `Dollar_endonly; `Multiline ] "$"; + [%expect {| End_of_line |}]; + re ~opts:[ `Dotall ] "."; + [%expect {| (Set 0-55295, 57344-1114111) |}]; + re ~opts:[ `Multiline ] "^"; + [%expect {| Beg_of_line |}]; + re ~opts:[ `Multiline ] "$"; + [%expect {| End_of_line |}]; + re ~opts:[ `Ungreedy ] "a*"; + [%expect {| (Sem_greedy Non_greedy (Repeat (Set 97) 0)) |}]; + re ~opts:[ `Ungreedy ] "a*?"; + [%expect {| (Sem_greedy Greedy (Repeat (Set 97) 0)) |}] +;; + +let%expect_test "clustering" = + re "(?:a)"; + [%expect {| (Set 97) |}]; + re "(?:a|b)c"; + [%expect {| (Sequence (Alternative (Set 97)(Set 98))(Set 99)) |}] +;; + +let%expect_test "comment" = + re "a(?#comment)b"; + [%expect {| (Sequence (Set 97)(Sequence )(Set 98)) |}]; + try_parse "(?#"; + [%expect {| Parse error |}] +;; + +let%expect_test "backrefs" = + try_parse "\\0"; + [%expect {| Not supported |}] +;; + +let%expect_test "ordinary characters" = + re "a"; + [%expect {| (Set 97) |}] +;; + +let%expect_test "concacentation" = + re "ab"; + [%expect {| (Sequence (Set 97)(Set 98)) |}] +;; + +let%expect_test "sets in classes" = + re "[a\\s]"; + [%expect {| + (Alternative + (Set 9-13, 32, 133, 160, 5760, 8192-8202, 8232-8233, 8239, 8287, 12288) + (Set 97)) + |}] +;; + +let%expect_test "fixed bug" = + (try ignore (Re.compile (Re.Perl.re "(.*?)(\\WPl|\\Bpl)(.*)")) with + | _ 
-> failwith "bug in Re.handle_case"); + [%expect {||}] +;; diff --git a/lib_test/unicode/expect/test_posix.ml b/lib_test/unicode/expect/test_posix.ml new file mode 100644 index 00000000..707453fa --- /dev/null +++ b/lib_test/unicode/expect/test_posix.ml @@ -0,0 +1,10 @@ +open Import + +let%expect_test "class space" = + let re = Re.Posix.compile_pat {|a[[:space:]]b|} in + let exec = Re.execp re in + assert (exec "a b"); + assert (not (exec "ab")); + assert (not (exec "a_b")); + [%expect {||}] +;; diff --git a/lib_test/unicode/expect/test_re.ml b/lib_test/unicode/expect/test_re.ml new file mode 100644 index 00000000..e99e17f3 --- /dev/null +++ b/lib_test/unicode/expect/test_re.ml @@ -0,0 +1,360 @@ +open Import +open Re + +let%expect_test "str" = + test_re (str "a") "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (str "a") "b"; + [%expect {| Not_found |}] + +let%expect_test "char" = + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "b"; + [%expect {| [| (0, 1) |] |}]; + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "c"; + [%expect {| Not_found |}] + +let%expect_test "alt" = + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "b"; + [%expect {| [| (0, 1) |] |}]; + test_re (alt [ char 'a' |> letter; char 'b' |> letter ]) "c"; + [%expect {| Not_found |}] + +let%expect_test "seq" = + test_re (seq [ char 'a' |> letter; char 'b' |> letter ]) "ab"; + [%expect {| [| (0, 2) |] |}]; + test_re (seq [ char 'a' |> letter; char 'b' |> letter ]) "ac"; + [%expect {| Not_found |}] + +let%expect_test "empty" = + test_re empty ""; + [%expect {| Not_found |}]; + test_re empty "a"; + [%expect {| Not_found |}] + +let%expect_test "epsilon" = + test_re epsilon ""; + [%expect {| [| (0, 0) |] |}]; + test_re epsilon "a"; + [%expect {| [| (0, 0) |] |}] + +let%expect_test "rep" = + 
test_re (rep (char 'a' |> letter)) ""; + [%expect {| [| (0, 0) |] |}]; + test_re (rep (char 'a' |> letter)) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (rep (char 'a' |> letter)) "aa"; + [%expect {| [| (0, 2) |] |}]; + test_re (rep (char 'a' |> letter)) "b"; + [%expect {| [| (0, 0) |] |}] + +let%expect_test "bol" = + test_re (seq [ bol; char 'a' |> letter ]) "ab"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ bol; char 'a' |> letter ]) "b\na"; + [%expect {| [| (2, 3) |] |}]; + test_re (seq [ bol; char 'a' |> letter ]) "ba"; + [%expect {| Not_found |}] + +let%expect_test "eol" = + test_re (seq [ char 'a' |> letter; eol ]) "ba"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char 'a' |> letter; eol ]) "a\nb"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ char 'a' |> letter; eol ]) "ba\n"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char 'a' |> letter; eol ]) "ab"; + [%expect {| Not_found |}] + +let%expect_test "bow" = + test_re (seq [ bow; char 'a' |> letter ]) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ bow; char 'a' |> letter ]) "bb aa"; + [%expect {| [| (3, 4) |] |}]; + test_re (seq [ bow; char 'a' |> letter ]) "ba ba"; + [%expect {| Not_found |}]; + test_re bow ";"; + [%expect {| Not_found |}]; + test_re bow ""; + [%expect {| Not_found |}] + +let%expect_test "eow" = + test_re (seq [ char 'a' |> letter; eow ]) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ char 'a' |> letter; eow ]) "bb aa"; + [%expect {| [| (4, 5) |] |}]; + test_re (seq [ char 'a' |> letter; eow ]) "ab ab"; + [%expect {| Not_found |}]; + test_re eow ";"; + [%expect {| Not_found |}]; + test_re eow ""; + [%expect {| Not_found |}] + +let%expect_test "bos" = + test_re (seq [ bos; char 'a' |> letter ]) "ab"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ bos; char 'a' |> letter ]) "b\na"; + [%expect {| Not_found |}]; + test_re (seq [ bos; char 'a' |> letter ]) "ba"; + [%expect {| Not_found |}] + +let%expect_test "eos" = + test_re (seq [ char 'a' |> letter; eos ]) 
"ba"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char 'a' |> letter; eos ]) "a\nb"; + [%expect {| Not_found |}]; + test_re (seq [ char 'a' |> letter; eos ]) "ba\n"; + [%expect {| Not_found |}]; + test_re (seq [ char 'a' |> letter; eos ]) "ab"; + [%expect {| Not_found |}] + +let%expect_test "leol" = + test_re (seq [ char 'a' |> letter; leol ]) "ba"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char 'a' |> letter; leol ]) "a\nb"; + [%expect {| Not_found |}]; + test_re (seq [ char 'a' |> letter; leol ]) "ba\n"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char 'a' |> letter; leol ]) "ab"; + [%expect {| Not_found |}]; + test_re (alt [ str "b\n"; seq [ char 'a' |> letter; leol ] ]) "ab\n"; + [%expect {| [| (1, 3) |] |}] + +let%expect_test "start" = + test_re ~pos:1 (seq [ start; char 'a' |> letter ]) "xab"; + [%expect {| [| (1, 2) |] |}]; + test_re ~pos:1 (seq [ start; char 'a' |> letter ]) "xb\na"; + [%expect {| Not_found |}]; + test_re ~pos:1 (seq [ start; char 'a' |> letter ]) "xba"; + [%expect {| Not_found |}] + +let%expect_test "stop" = + test_re ~len:2 (seq [ char 'a' |> letter; stop ]) "bax"; + [%expect {| [| (1, 2) |] |}]; + test_re ~len:3 (seq [ char 'a' |> letter; stop ]) "a\nbx"; + [%expect {| Not_found |}]; + test_re ~len:3 (seq [ char 'a' |> letter; stop ]) "ba\nx"; + [%expect {| Not_found |}]; + test_re ~len:2 (seq [ char 'a' |> letter; stop ]) "abx"; + [%expect {| Not_found |}] + +let%expect_test "word" = + test_re (word (str "aa")) "aa"; + [%expect {| [| (0, 2) |] |}]; + test_re (word (str "aa")) "bb aa"; + [%expect {| [| (3, 5) |] |}]; + test_re (word (str "aa")) "aaa"; + [%expect {| Not_found |}]; + test_re (word (str "")) ""; + [%expect {| Not_found |}] + +let%expect_test "not_boundary" = + test_re (seq [ not_boundary; char 'b' |> letter; not_boundary ]) "abc"; + [%expect {| [| (1, 2) |] |}]; + test_re (seq [ char ';' |> letter; not_boundary; char ';' |> letter ]) ";;"; + [%expect {| [| (0, 2) |] |}]; + test_re (seq [ not_boundary; char 
';' |> letter; not_boundary ]) ";"; + [%expect {| [| (0, 1) |] |}]; + test_re (seq [ not_boundary; char 'a' |> letter ]) "abc"; + [%expect {| Not_found |}]; + test_re (seq [ char 'c' |> letter; not_boundary ]) "abc"; + [%expect {| Not_found |}] + +let%expect_test "default match semantics" = + test_re + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ]) + "aabaab"; + [%expect {| [| (0, 6) |] |}]; + test_re (alt [ str "aa"; str "aaa" ]) "aaaa"; + [%expect {| [| (0, 2) |] |}]; + test_re (alt [ str "aaa"; str "aa" ]) "aaaa"; + [%expect {| [| (0, 3) |] |}] + +let%expect_test "shortest match" = + test_re + (shortest + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ])) + "aabaab"; + [%expect {| [| (0, 3) |] |}]; + test_re (shortest (alt [ str "aa"; str "aaa" ])) "aaaa"; + [%expect {| [| (0, 2) |] |}]; + test_re (shortest (alt [ str "aaa"; str "aa" ])) "aaaa"; + [%expect {| [| (0, 2) |] |}] + +let%expect_test "longest match" = + test_re + (longest + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ])) + "aabaab"; + [%expect {| [| (0, 6) |] |}]; + test_re (longest (alt [ str "aa"; str "aaa" ])) "aaaa"; + [%expect {| [| (0, 3) |] |}]; + test_re (longest (alt [ str "aaa"; str "aa" ])) "aaaa"; + [%expect {| [| (0, 3) |] |}] + +let%expect_test "first match" = + test_re + (first + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ])) + "aabaab"; + [%expect {| [| (0, 6) |] |}]; + test_re (first (alt [ str "aa"; str "aaa" ])) "aaaa"; + [%expect {| [| (0, 2) |] |}]; + test_re (first (alt [ str "aaa"; str "aa" ])) "aaaa"; + [%expect {| [| (0, 3) |] |}] + +let%expect_test "match_semantics" = + let r = rep (group (alt [ str "aaa"; str "aa" ])) in + test_re (longest r) "aaaaaaa"; + [%expect {| [| (0, 7); (5, 7) |] |}]; + test_re (first r) "aaaaaaa"; + [%expect {| [| (0, 6); (3, 6) |] |}]; + test_re (first (non_greedy r)) 
"aaaaaaa"; + [%expect {| [| (0, 0); (-1, -1) |] |}]; + test_re (shortest r) "aaaaaaa"; + [%expect {| [| (0, 0); (-1, -1) |] |}]; + let r' = rep (group (shortest (alt [ str "aaa"; str "aa" ]))) in + test_re (longest r') "aaaaaaa"; + [%expect {| [| (0, 7); (4, 7) |] |}]; + test_re (first r') "aaaaaaa"; + [%expect {| [| (0, 6); (4, 6) |] |}] + +let%expect_test "greedy" = + test_re + (greedy + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ])) + "aabaab"; + [%expect {| [| (0, 6) |] |}]; + test_re (greedy (rep (group (opt (char 'a' |> letter))))) "aa"; + [%expect {| [| (0, 2); (2, 2) |] |}] + +let%expect_test "non_greedy" = + test_re + (non_greedy + (longest + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ]))) + "aabaab"; + [%expect {| [| (0, 6) |] |}]; + test_re + (non_greedy + (first + (seq + [ + rep (alt [ char 'a' |> letter; char 'b' |> letter ]); + char 'b' |> letter; + ]))) + "aabaab"; + [%expect {| [| (0, 3) |] |}]; + test_re (non_greedy (longest (rep (group (opt (char 'a' |> letter)))))) "aa"; + [%expect {| [| (0, 2); (1, 2) |] |}] + +let%expect_test "set" = + test_re (rep1 (set "abcd")) "bcbadbabcdba"; + [%expect {| [| (0, 12) |] |}]; + test_re (set "abcd") "e"; + [%expect {| Not_found |}] + +let%expect_test "rg" = + test_re (rep1 (rg (char '0') (char '9'))) "0123456789"; + [%expect {| [| (0, 10) |] |}]; + test_re (rep1 (rg (char '0') (char '9'))) "a"; + [%expect {| Not_found |}] + +let%expect_test "inter" = + test_re + (rep1 (inter [ rg (char '0') (char '9'); rg (char '4') (char '6') ])) + "456"; + [%expect {| [| (0, 3) |] |}]; + test_re + (rep1 (inter [ rg (char '0') (char '9'); rg (char '4') (char '6') ])) + "7"; + [%expect {| Not_found |}]; + test_re + (inter + [ alt [ char 'a' |> letter; char 'b' |> letter ]; char 'b' |> letter ]) + "b"; + [%expect {| [| (0, 1) |] |}] + +let%expect_test "diff" = + test_re + (rep1 (diff (rg (char '0') (char '9')) (rg (char '4') (char 
'6')))) + "0123789"; + [%expect {| [| (0, 7) |] |}]; + test_re + (rep1 (diff (rg (char '0') (char '9')) (rg (char '4') (char '6')))) + "4"; + [%expect {| Not_found |}] + +let%expect_test "compl" = + test_re + (rep1 (compl [ rg (char '0') (char '9'); rg (char 'a') (char 'z') ])) + "A:Z+"; + [%expect {| [| (0, 4) |] |}]; + test_re + (rep1 (compl [ rg (char '0') (char '9'); rg (char 'a') (char 'z') ])) + "0"; + [%expect {| Not_found |}]; + test_re + (rep1 (compl [ rg (char '0') (char '9'); rg (char 'a') (char 'z') ])) + "a"; + [%expect {| Not_found |}] + +let%expect_test "case" = + test_re (case (str "abc")) "abc"; + [%expect {| [| (0, 3) |] |}]; + test_re (no_case (case (str "abc"))) "abc"; + [%expect {| [| (0, 3) |] |}]; + test_re (case (str "abc")) "ABC"; + [%expect {| Not_found |}]; + test_re (no_case (case (str "abc"))) "ABC"; + [%expect {| Not_found |}] + +let%expect_test "no_case" = + test_re (no_case (str "abc")) "abc"; + [%expect {| [| (0, 3) |] |}]; + test_re (no_case (str "abc")) "ABC"; + [%expect {| [| (0, 3) |] |}]; + test_re (case (no_case (str "abc"))) "abc"; + [%expect {| [| (0, 3) |] |}]; + test_re (case (no_case (str "abc"))) "ABC"; + [%expect {| [| (0, 3) |] |}] + +let%expect_test "witness" = + let t re = print_endline (witness re) in + t (set "ac"); + [%expect {| a |}]; + t (repn (str "foo") 3 None); + [%expect {| foofoofoo |}]; + t (alt [ char 'c' |> letter; char 'd' |> letter ]); + [%expect {| c |}]; + t (no_case (str "test")); + [%expect {| TEST |}]; + t eol; + [%expect {| |}] diff --git a/lib_test/unicode/expect/test_replace.ml b/lib_test/unicode/expect/test_replace.ml new file mode 100644 index 00000000..1ec780bf --- /dev/null +++ b/lib_test/unicode/expect/test_replace.ml @@ -0,0 +1,33 @@ +open Import + +let%expect_test "test_replace" = + let re = Re.Posix.compile_pat "[a-zA-Z]+" in + let f sub = String.capitalize_ascii (Re.Group.get sub 0) in + print_endline (Re.replace re ~f " hello world; I love chips!"); + [%expect {| Hello World; I Love 
Chips! |}]; + print_endline (Re.replace ~all:false re ~f " allo maman, bobo"); + [%expect {| Allo maman, bobo |}]; + print_endline (Re.replace re_empty ~f:(fun _ -> "a") ""); + [%expect {| a |}]; + print_endline (Re.replace (Re.compile (Re.rep (Re.(char 'a' |> letter)))) ~f:(fun _ -> "*") "cat"); + [%expect {| *c*t* |}] +;; + +let%expect_test "test_replace_string" = + let re = Re.Posix.compile_pat "_[a-zA-Z]+_" in + print_endline (Re.replace_string re ~by:"goodbye" "_hello_ world"); + [%expect {| goodbye world |}]; + print_endline (Re.replace_string ~all:false re ~by:"brown" "The quick _XXX_ fox"); + [%expect {| The quick brown fox |}] +;; + +let%expect_test "test_bug_55" = + let re = Re.(compile bol) in + let res = Re.replace_string re ~by:"z" "abc" in + print_endline res; + [%expect {| zabc |}]; + let re = Re.(compile eow) in + let res = Re.replace_string re ~by:"X" "one two three" in + print_endline res; + [%expect {| oneX twoX threeX |}] +;; diff --git a/lib_test/unicode/expect/test_repn.ml b/lib_test/unicode/expect/test_repn.ml new file mode 100644 index 00000000..69110cfc --- /dev/null +++ b/lib_test/unicode/expect/test_repn.ml @@ -0,0 +1,80 @@ +open Import +open Re + +let%expect_test "fixed repetition" = + let re = Re.compile @@ Re.(repn (char 'a' |> letter) 3 (Some 3)) in + let test s = printf "%b\n" (Re.execp re s) in + test ""; + [%expect {| false |}]; + test "aa"; + [%expect {| false |}]; + test "aaa"; + [%expect {| true |}]; + test "aaaa"; + [%expect {| true |}] +;; + +let%expect_test "repn" = + let a = char 'a' |> letter in + test_re (repn a 0 None) ""; + [%expect {| [| (0, 0) |] |}]; + test_re (repn a 2 None) "a"; + [%expect {| Not_found |}]; + test_re (repn a 2 None) "aa"; + [%expect {| [| (0, 2) |] |}]; + test_re (repn a 0 (Some 0)) ""; + [%expect {| [| (0, 0) |] |}]; + test_re (repn a 1 (Some 2)) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (repn a 1 (Some 2)) "aa"; + [%expect {| [| (0, 2) |] |}]; + test_re (repn a 1 (Some 2)) ""; + [%expect {| 
Not_found |}]; + test_re (repn a 1 (Some 2)) "aaa"; + [%expect {| [| (0, 2) |] |}]; + invalid_argument (fun () -> repn empty (-1) None); + [%expect {| Invalid_argument "Re.repn" |}]; + invalid_argument (fun () -> repn empty 1 (Some 0)); + [%expect {| Invalid_argument "Re.repn" |}]; + invalid_argument (fun () -> repn empty 4 (Some 3)); + [%expect {| Invalid_argument "Re.repn" |}] +;; + +let char c = Re.char c |> letter + +let%expect_test "rep1" = + test_re (rep1 (char 'a')) "a"; + [%expect {| [| (0, 1) |] |}]; + test_re (rep1 (char 'a')) "aa"; + [%expect {| [| (0, 2) |] |}]; + test_re (rep1 (char 'a')) ""; + [%expect {| Not_found |}]; + test_re (rep1 (char 'a')) "b"; + [%expect {| Not_found |}] +;; + +let%expect_test "opt" = + test_re (opt (char 'a')) ""; + [%expect {| [| (0, 0) |] |}]; + test_re (opt (char 'a')) "a"; + [%expect {| [| (0, 1) |] |}] +;; + +let copy s n = + let len = String.length s in + let b = Bytes.make (len * n) '\000' in + for i = 0 to n - 1 do + Bytes.blit_string s 0 b (i * len) len + done; + Bytes.to_string b +;; + +let%expect_test "repeat sequence" = + let s = "abcde" in + let re = str s |> rep |> whole_string |> compile in + for i = 0 to 3 do + let r = copy s i in + assert (Re.execp re r) + done; + [%expect {||}] +;; diff --git a/lib_test/unicode/expect/test_split.ml b/lib_test/unicode/expect/test_split.ml new file mode 100644 index 00000000..04e60d33 --- /dev/null +++ b/lib_test/unicode/expect/test_split.ml @@ -0,0 +1,76 @@ +open Import + +let re_whitespace = Re.Posix.compile_pat "[\t ]+" +let re_eol = Re.compile Re.eol +let re_bow = Re.compile Re.bow +let re_eow = Re.compile Re.eow + +let%expect_test "split" = + let split ?pos ?len re s = strings (Re.split ?pos ?len re s) in + split re_whitespace "aa bb c d "; + [%expect {| ["aa"; "bb"; "c"; "d"] |}]; + split ~pos:1 ~len:4 re_whitespace "aa b c d"; + [%expect {| ["a"; "b"] |}]; + split re_whitespace " a full_word bc "; + [%expect {| ["a"; "full_word"; "bc"] |}]; + split re_empty "abcd"; + 
[%expect {| ["a"; "b"; "c"; "d"] |}]; + split re_eol "a\nb"; + [%expect {| + ["a"; "\nb"] |}]; + split re_bow "a b"; + [%expect {| ["a "; "b"] |}]; + split re_eow "a b"; + [%expect {| ["a"; " b"] |}]; + split re_whitespace ""; + [%expect {| [] |}]; + split re_empty ""; + [%expect {| [] |}] +;; + +let%expect_test "split_delim" = + let split_delim ?pos ?len re s = strings (Re.split_delim ?pos ?len re s) in + split_delim re_whitespace "aa bb c d "; + [%expect {| ["aa"; "bb"; "c"; "d"; ""] |}]; + split_delim ~pos:1 ~len:4 re_whitespace "aa b c d"; + [%expect {| ["a"; "b"; ""] |}]; + split_delim re_whitespace " a full_word bc "; + [%expect {| [""; "a"; "full_word"; "bc"; ""] |}]; + split_delim re_empty "abcd"; + [%expect {| [""; "a"; "b"; "c"; "d"; ""] |}]; + split_delim re_eol "a\nb"; + [%expect {| ["a"; "\nb"; ""] |}]; + split_delim re_bow "a b"; + [%expect {| [""; "a "; "b"] |}]; + split_delim re_eow "a b"; + [%expect {| ["a"; " b"; ""] |}]; + split_delim re_whitespace ""; + [%expect {| [""] |}]; + split_delim re_empty ""; + [%expect {| [""; ""] |}] +;; + +let%expect_test "split_full" = + let split_full ?pos ?len re s = + let res = Re.split_full ?pos ?len re s in + Format.printf + "[%a]@." 
+ Fmt.( + list ~pp_sep:(Fmt.lit "; ") (fun fmt what -> + match what with + | `Text s -> Format.fprintf fmt "`T %S" s + | `Delim s -> Format.fprintf fmt "`D %S" (Re.Group.get s 0))) + res + in + split_full re_whitespace "aa bb c d "; + [%expect {| [`T "aa"; `D " "; `T "bb"; `D " "; `T "c"; `D " "; `T "d"; `D " "] |}]; + split_full ~pos:1 ~len:5 re_whitespace "aa \tb c d"; + [%expect {| [`T "a"; `D " \t"; `T "b"; `D " "] |}]; + split_full re_whitespace " a full_word bc "; + [%expect {| [`D " "; `T "a"; `D " "; `T "full_word"; `D " "; `T "bc"; `D " "] |}]; + split_full re_empty "ab"; + [%expect {| [`D ""; `T "a"; `D ""; `T "b"; `D ""] |}]; + split_full Re.(compile (rep (char 'a' |> letter))) "cat"; + [%expect {| [`D ""; `T "c"; `D "a"; `T "t"; `D ""] |}]; + () +;; diff --git a/lib_test/unicode/expect/test_str.ml b/lib_test/unicode/expect/test_str.ml new file mode 100644 index 00000000..c1c4b6ea --- /dev/null +++ b/lib_test/unicode/expect/test_str.ml @@ -0,0 +1,333 @@ +open Import + +module type Str_intf = module type of Str + +module Test_matches (R : Str_intf) = struct + let groups () = + let group i = + try `Found (R.group_beginning i) with + | Not_found -> `Not_found + | Invalid_argument _ -> `Not_exists + in + let rec loop acc i = + match group i with + | `Found p -> loop ((p, R.group_end i) :: acc) (i + 1) + | `Not_found -> loop ((-1, -1) :: acc) (i + 1) + | `Not_exists -> List.rev acc + in + loop [] 0 + ;; + + let eq_match ?(pos = 0) ?(case = true) r s = + let pat = if case then R.regexp r else R.regexp_case_fold r in + try + ignore (R.search_forward pat s pos); + Some (groups ()) + with + | Not_found -> None + ;; + + let eq_match' ?(pos = 0) ?(case = true) r s = + let pat = if case then R.regexp r else R.regexp_case_fold r in + try + ignore (R.string_match pat s pos); + Some (groups ()) + with + | Not_found -> None + ;; +end + +module T_str = Test_matches (Str) +module T_re = Test_matches (Re.Str) + +let test dyn_of_ok str re args = + let run f = + match f () 
with + | s -> Ok s + | exception exn -> Error exn + in + let str = run (fun () -> str args) in + let re = run (fun () -> re args) in + if not (Poly.equal str re) + then ( + let printer x = + let dyn = + let open Dyn in + result dyn_of_ok (fun x -> string (Printexc.to_string x)) x + in + sexp_of_dyn dyn |> Base.Sexp.to_string_hum + in + Printf.printf "str: %s\n" (printer str); + Printf.printf "re: %s\n" (printer re)) +;; + +let dyn_of_pairs x = + Dyn.option + (fun x -> + List.map x ~f:(fun (start, stop) -> + let open Dyn in + pair (int start) (int stop)) + |> Dyn.list) + x +;; + +let split_result_conv = + List.map ~f:(function + | Str.Delim x -> Re.Str.Delim x + | Str.Text x -> Re.Str.Text x) +;; + +let dyn_split_result_list list = + List.map + list + ~f: + (let open Dyn in + function + | Re.Str.Delim x -> variant "Delim" [ string x ] + | Text s -> variant "Text" [ string s ]) + |> Dyn.list +;; + +type ('a, 'b) test = + { name : string + ; dyn_of_ok : 'b -> Dyn.t + ; re_str : Re.Str.regexp -> 'a -> 'b + ; str : Str.regexp -> 'a -> 'b + } + +let bounded_split_t = + { name = "bounded_split" + ; dyn_of_ok = (fun x -> Dyn.list (List.map x ~f:Dyn.string)) + ; re_str = (fun re (s, n) -> Re.Str.bounded_split re s n) + ; str = (fun re (s, n) -> Str.bounded_split re s n) + } +;; + +let bounded_full_split_t = + { name = "bounded_full_split" + ; dyn_of_ok = dyn_split_result_list + ; re_str = (fun re (s, n) -> Re.Str.bounded_full_split re s n) + ; str = (fun re (s, n) -> split_result_conv (Str.bounded_full_split re s n)) + } +;; + +let full_split_t = + { bounded_full_split_t with + name = "full_split" + ; re_str = (fun re s -> Re.Str.full_split re s) + ; str = (fun re s -> split_result_conv (Str.full_split re s)) + } +;; + +let split_delim_t = + { name = "split_delim" + ; dyn_of_ok = (fun x -> Dyn.list (List.map x ~f:Dyn.string)) + ; re_str = Re.Str.split_delim + ; str = Str.split_delim + } +;; + +let split_t = + { name = "split" + ; dyn_of_ok = (fun x -> Dyn.list (List.map x 
~f:Dyn.string)) + ; re_str = Re.Str.split + ; str = Str.split + } +;; + +let global_replace_t = + { name = "global_replace" + ; dyn_of_ok = Dyn.string + ; re_str = (fun re (r, s) -> Re.Str.global_replace re r s) + ; str = (fun re (r, s) -> Str.global_replace re r s) + } +;; + +let eq_match ?pos ?case re = + test dyn_of_pairs (T_str.eq_match ?pos ?case re) (T_re.eq_match ?pos ?case re) +;; + +let eq_match' ?pos ?case re = + test dyn_of_pairs (T_str.eq_match' ?pos ?case re) (T_re.eq_match' ?pos ?case re) +;; + +let test t re args = + test t.dyn_of_ok (t.re_str (Re.Str.regexp re)) (t.str (Str.regexp re)) args +;; + +let split_delim re s = test split_delim_t re s +let split re s = test split_t re s +let full_split re s = test full_split_t re s +let bounded_split re s n = test bounded_split_t re (s, n) +let bounded_full_split re s n = test bounded_full_split_t re (s, n) +let global_replace re r s = test global_replace_t re (r, s) + +let%expect_test "literal match" = + eq_match "a" "a"; + eq_match "a" "b"; + [%expect {||}] +;; + +let%expect_test "alt" = + eq_match "a\\|b" "a"; + eq_match "a\\|b" "b"; + eq_match "a\\|b" "c"; + [%expect {||}] +;; + +let%expect_test "seq" = + eq_match "ab" "ab"; + eq_match "ab" "ac"; + [%expect {||}] +;; + +let%expect_test "epsilon" = + eq_match "" ""; + eq_match "" "a"; + [%expect {||}] +;; + +let%expect_test "rep" = + eq_match "a*" ""; + eq_match "a*" "a"; + eq_match "a*" "aa"; + eq_match "a*" "b"; + [%expect {||}] +;; + +let%expect_test "rep1" = + eq_match "a+" "a"; + eq_match "a+" "aa"; + eq_match "a+" ""; + eq_match "a+" "b"; + [%expect {| |}] +;; + +let%expect_test "opt" = + eq_match "a?" ""; + eq_match "a?" 
"a"; + [%expect {| |}] +;; + +let%expect_test "bol" = + eq_match "^a" "ab"; + eq_match "^a" "b\na"; + eq_match "^a" "ba"; + [%expect {| |}] +;; + +let%expect_test "eol" = + eq_match "a$" "ba"; + eq_match "a$" "a\nb"; + eq_match "a$" "ba\n"; + eq_match "a$" "ab"; + [%expect {| |}] +;; + +let%expect_test "start" = + eq_match ~pos:1 "Za" "xab"; + eq_match ~pos:1 "Za" "xb\na"; + eq_match ~pos:1 "Za" "xba"; + [%expect {||}] +;; + +let%expect_test "match semantics" = + eq_match "\\(a\\|b\\)*b" "aabaab"; + eq_match "aa\\|aaa" "aaaa"; + eq_match "aaa\\|aa" "aaaa"; + [%expect {||}] +;; + +let%expect_test "Group (or submatch)" = + eq_match "\\(a\\)\\(a\\)?\\(b\\)" "ab"; + [%expect {| |}]; + eq_match "\\(foo" "foo"; + [%expect {| + str: (Error "Failure(\"\\\\( group not closed by \\\\)\")") + re: (Error Re_private_unicode.Emacs.Parse_error) + |}] +;; + +let%expect_test "Character set" = + eq_match "[0-9]+" "0123456789"; + eq_match "[0-9]+" "a"; + eq_match "[9-0]+" "2"; + eq_match "[5-5]" "5"; + eq_match "[5-4]" "1"; + eq_match' "[]]" "]"; + eq_match' "[a-]" "-"; + eq_match' "[-a]" "-"; + eq_match' "]" "]"; + eq_match' "[^b-f]" "z"; + eq_match' "[^b-f]" "a"; + [%expect {||}]; + (* These errors aren't correct *) + eq_match' "[]" "x"; + eq_match' "[" "["; + [%expect + {| + str: (Error "Failure(\"[ class not closed by ]\")") + re: (Error Re_private_unicode.Emacs.Parse_error) + str: (Error "Failure(\"[ class not closed by ]\")") + re: (Error Re_private_unicode.Emacs.Parse_error) + |}] +;; + +let%expect_test "compl" = + eq_match "[^0-9a-z]+" "A:Z+"; + eq_match "[^0-9a-z]+" "0"; + eq_match "[^0-9a-z]+" "a"; + [%expect {| + str: (Ok (Some ((0 4)))) + re: (Ok (Some ((3 4)))) + |}] +;; + +let%expect_test "Word modifiers" = + eq_match' "\\bfoo" "foo"; + eq_match' "\\" "foo"; + eq_match' "z\\Bfoo" "zfoo"; + eq_match' "\\`foo" "foo"; + eq_match' "foo\\'" "foo"; + [%expect {||}] +;; + +let%expect_test "Case modifiers" = + eq_match ~case:false "abc" "abc"; + eq_match ~case:false "abc" 
"ABC"; + [%expect {| |}] +;; + +let%expect_test "global_replace" = + global_replace "needle" "test" "needlehaystack"; + global_replace "needle" "" ""; + global_replace "needle" "" "needle"; + global_replace "xxx" "yyy" "zzz"; + global_replace "test\\([0-9]*\\)" "\\1-foo-\\1" "test100 test200 test"; + global_replace "test\\([0-9]*\\)" "'\\-0'" "test100 test200 test"; + (* Regression test for #129 *) + global_replace "\\(X+\\)" "A\\1YY" "XXXXXXZZZZ"; + [%expect {||}] +;; + +let%expect_test "bounded_split, bounded_full_split" = + [ ",", "foo,bar,baz", 5 + ; ",", "foo,bar,baz", 1 + ; ",", "foo,bar,baz", 0 + ; ",\\|", "foo,bar|baz", 4 + ] + |> List.iter ~f:(fun (re, s, n) -> + bounded_full_split re s n; + bounded_split re s n); + [%expect {||}] +;; + +let%expect_test "split, full_split, split_delim" = + [ "re", ""; " ", "foo bar"; "\b", "one-two three"; "[0-9]", "One3TwoFive" ] + |> List.iter ~f:(fun (re, s) -> + split re s; + full_split re s; + split_delim re s); + [%expect {||}] +;; diff --git a/lib_test/unicode/expect/test_stream.ml b/lib_test/unicode/expect/test_stream.ml new file mode 100644 index 00000000..8be5c570 --- /dev/null +++ b/lib_test/unicode/expect/test_stream.ml @@ -0,0 +1,203 @@ +open Import +module Stream = Re.Stream + +let feed t str = + let res = Stream.feed t str ~pos:0 ~len:(String.length str) in + let () = + match res with + | No_match -> Printf.printf "%S did not match\n" str + | Ok s -> + let status = + match Stream.finalize s "" ~pos:0 ~len:0 with + | true -> "matched" + | false -> "unmatched" + in + Printf.printf "%S not matched (status = %s)\n" str status + in + res +;; + +let%expect_test "codec error" = + let stream = Re.any |> Re.compile |> Stream.create in + ignore_or_exception (fun () -> ignore (Stream.feed stream "foo" ~pos:2 ~len:3)); + [%expect {| "CodecError" |}]; + ignore_or_exception (fun () -> ignore (Stream.finalize stream "foo" ~pos:2 ~len:3)); + [%expect {| "CodecError" |}]; + let stream = Stream.Group.create stream in + 
ignore_or_exception (fun () -> ignore (Stream.Group.feed stream "foo" ~pos:2 ~len:3)); + [%expect {| "CodecError" |}]; + ignore_or_exception (fun () -> ignore (Stream.Group.finalize stream "foo" ~pos:2 ~len:3)); + [%expect {| "CodecError" |}] +;; + +let%expect_test "basic" = + let s = [ Re.bos; Re.str "abab" ] |> Re.seq |> Re.compile |> Stream.create in + ignore (feed s "x"); + [%expect {| "x" did not match |}]; + let suffix = "ab" in + let s = + match feed s suffix with + | Ok s -> s + | No_match -> assert false + in + [%expect {| + "ab" not matched (status = unmatched) |}]; + (let (_ : _ Stream.feed) = feed s "ab" in + [%expect {| + "ab" not matched (status = matched) |}]); + let (_ : _ Stream.feed) = feed s "xy" in + [%expect {| + "xy" did not match |}] +;; + +let%expect_test "eos" = + let s = [ Re.str "zzz"; Re.eos ] |> Re.seq |> Re.compile |> Stream.create in + ignore (feed s "zzz"); + [%expect {| "zzz" not matched (status = matched) |}]; + let s = + match feed s "z" with + | Ok s -> s + | No_match -> assert false + in + [%expect {| "z" not matched (status = unmatched) |}]; + (let str = "zz" in + match Stream.finalize s str ~pos:0 ~len:(String.length str) with + | true -> () + | false -> assert false); + [%expect {||}] +;; + +let%expect_test "finalize empty" = + let s = "abde" in + let stream = + let stream = Re.str s |> Re.whole_string |> Re.compile |> Stream.create in + match feed stream s with + | Ok s -> s + | No_match -> assert false + in + assert (Stream.finalize stream "" ~pos:0 ~len:0); + [%expect {| "abde" not matched (status = matched) |}] +;; + +let%expect_test "group - basic" = + let s = + let open Re in + str "foo" |> whole_string |> group |> compile |> Stream.create + in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "f" ~pos:0 ~len:1 with + | No_match -> assert false + | Ok s -> s + in + (match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with + | Ok _ -> () + | No_match -> assert false); + [%expect {| |}] +;; + +let 
pmarks set m = + Printf.printf "mark present %b\n" (Re.Stream.Group.Match.test_mark set m) +;; + +let%expect_test "group - mark entire string must match" = + let m1, f = Re.(mark (char 'f' |> letter)) in + let m2, oo = Re.(mark (str "oo")) in + let re = + let open Re in + [ f; oo ] |> seq |> compile + in + let s = Stream.create re in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "f" ~pos:0 ~len:1 with + | No_match -> assert false + | Ok s -> s + in + let g = + match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with + | Ok g -> g + | No_match -> assert false + in + pmarks g m1; + [%expect {| mark present true |}]; + pmarks g m2; + [%expect {| mark present true |}] +;; + +let%expect_test "group - partial mark match" = + let m, foo = Re.(mark (str "foo")) in + let re = Re.compile foo in + let s = Stream.create re in + let g = Stream.Group.create s in + let g = + match Stream.Group.feed g "xx" ~pos:0 ~len:2 with + | No_match -> assert false + | Ok g -> g + in + let g = + match Stream.Group.feed g "foo" ~pos:0 ~len:3 with + | Ok g -> g + | No_match -> assert false + in + let g = + match Stream.Group.finalize g "garb" ~pos:0 ~len:4 with + | Ok g -> g + | No_match -> assert false + in + pmarks g m; + [%expect {| mark present true |}] +;; + +let print_match match_ n = + match Stream.Group.Match.get match_ n with + | None -> Printf.printf "match %d: \n" n + | Some s -> Printf.printf "match %d: %s\n" n s +;; + +let%expect_test "group - match group" = + let stream = + let re = Re.Pcre.re "_([a-z]+)_" |> Re.whole_string |> Re.compile in + Stream.Group.create (Stream.create re) + in + let s = "_abc_" in + let () = + match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> + for i = 0 to 1 do + print_match m i + done + in + [%expect {| + match 0: _abc_ + match 1: abc + |}] +;; + +let%expect_test "group - match group" = + let stream = + let re = Re.Pcre.re "_([a-z]+)__([a-z]+)_" |> Re.whole_string |> 
Re.compile in + Stream.Group.create (Stream.create re) + in + let s = "_abc_" in + let stream = + match Stream.Group.feed stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> m + in + let s = "_de_" in + let () = + match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with + | No_match -> assert false + | Ok m -> + for i = 0 to 2 do + print_match m i + done + in + [%expect {| + match 0: _abc__de_ + match 1: abc + match 2: de + |}] +;; diff --git a/lib_test/unicode/expect/test_validation.ml b/lib_test/unicode/expect/test_validation.ml new file mode 100644 index 00000000..1be75fd3 --- /dev/null +++ b/lib_test/unicode/expect/test_validation.ml @@ -0,0 +1,12 @@ +open Import + +let () = Printexc.record_backtrace false +let any = Re.(compile (rep any)) + +let%expect_test "bound errors" = + let (_ : bool) = Re.execp any ~pos:4 "foo" in + [%expect.unreachable]; + let (_ : bool) = Re.execp any ~pos:1 ~len:3 "foo" in + [%expect.unreachable] +[@@expect.uncaught_exn {| (Re_private_unicode.Uucodecs.CodecError) |}] +;; diff --git a/lib_test/unicode/expect/test_view.ml b/lib_test/unicode/expect/test_view.ml new file mode 100644 index 00000000..b04d28d6 --- /dev/null +++ b/lib_test/unicode/expect/test_view.ml @@ -0,0 +1,6 @@ +open Import + +let%expect_test "view" = + let view = Re.View.view (Re.str "foo") in + ignore view +;; diff --git a/re.opam b/re.opam index cd0b27f2..ec389167 100644 --- a/re.opam +++ b/re.opam @@ -24,6 +24,9 @@ bug-reports: "https://github.com/ocaml/ocaml-re/issues" depends: [ "dune" {>= "3.15"} "ocaml" {>= "4.08.0"} + "uucp" + "uucd" {build} + "zip" {build} "ppx_expect" {with-test} "ounit2" {with-test} "js_of_ocaml" {with-test}