Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
(* [sc_ulen sc] scans [sc] from offset 0 as UTF-8 and
   returns (count_of_chars, length_in_bytes, option error):

   - count_of_chars: number of complete sequences accepted so far;
   - length_in_bytes: offset at which scanning stopped.  It equals
     [SC.length sc] when the whole input was consumed.  When the input
     ends in the middle of a multi-byte sequence, it is the offset of
     that sequence head byte and the error is [None].  On an error it
     is the offset of the head byte of the offending sequence;
   - error: [None] on success or truncation, otherwise the reported
     exception value.

   When [relaxed_utf8.val] is set, head bytes 0xC0..0xC1 are treated as
   ordinary 2-byte heads and 4-byte sequences get only the tail-byte
   check (no codepoint range check).

   NOTE(review): only the checks visible below are performed; overlong
   3-byte and 4-byte forms and surrogate codepoints do not appear to be
   rejected here -- confirm whether callers rely on that. *)
value (sc_ulen : SC.t char -> (int * int * option exn)) sc =
  let sc_len = SC.length sc in
  let get i = Char.code (SC.get sc i) in
  (* [ch] = characters counted so far, [i] = offset of the next head byte *)
  let rec loop ~ch ~i =
    if i = sc_len
    then
      (ch, i, None)
    else
      let byte = get i in
      if byte < 0x80
      then loop ~ch:(ch+1) ~i:(i+1)
      else if byte <= 0xBF
      then (ch, i, some & Bad_utf8 "head 0x80..0xBF")
      else if byte <= 0xC1
      then
        (if relaxed_utf8.val
         then skip_tail ~ch ~i ~sz:2
         else (ch, i, some & Bad_utf8 "head 0xC0..0xC1 (overlong)")
        )
      else if byte < 0xE0
      then skip_tail ~ch ~i ~sz:2
      else if byte < 0xF0
      then skip_tail ~ch ~i ~sz:3
      else if byte <= 0xF4
      then skip_tail ~ch ~i ~sz:4
      else (ch, i, some & Bad_utf8 "head 0xF5..0xFF")
  and skip_tail ~ch ~sz ~i = (* check len, then check_tail *)
    if i + sz > sc_len
    then
      (* the sequence is cut off by the end of input: stop before it,
         without reporting an error *)
      (ch, i, None)
    else
      (if sz = 4 && not relaxed_utf8.val
       then check_tail4 (* check for codepoint too *)
       else check_tail ~len:(sz-1)
      ) ~i ~ch ~ifrom:(i+1)
  and check_tail ~i ~ch ~ifrom ~len = (* just check for 0b10xxxxxx *)
    if len = 0
    then loop ~ch:(ch+1) ~i:ifrom
    else
      let byte = get ifrom in
      if in_tail byte
      then check_tail ~i ~ch ~ifrom:(ifrom+1) ~len:(len-1)
      else (ch, i, bad_tail)
  and check_tail4 ~i ~ch ~ifrom = (* 0b10xxxxxx and codepoint *)
    (* [ifrom] is accepted only so that this function has the same
       labelled signature as [check_tail ~len]; the four bytes are
       addressed from the head offset [i]. *)
    let _ = ifrom in
    let a = get i and b = get (i+1) and c = get (i+2) and d = get (i+3) in
    if not (in_tail b && in_tail c && in_tail d)
    then
      (ch, i, bad_tail)
    else
      let codepoint = decode_4bytes a b c d in
      if codepoint > 0x10FFFF
      then (ch, i, some & Bad_utf8 "codepoint > 0x10FFFF")
      else
        (* Resume right after the 4-byte sequence.  The previous code
           used ifrom+4, which is i+5: it skipped one extra byte and
           could run past the end of input. *)
        loop ~ch:(ch+1) ~i:(i+4)
  in
    loop ~ch:0 ~i:0
;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement