45d450aedb1f — Chris Cannam 2 years ago
Add isValidUtf8Prefix
1 files changed, 14 insertions(+), 3 deletions(-)

M decoder.sml
M decoder.sml +14 -3
@@ 10,9 10,17 @@ structure Utf8Decoder :> sig
         (word * word list -> word list) -> word list -> string
         -> word list
 
+    (* Return true if the given string is valid UTF-8, false
+       otherwise. *)
     val isValidUtf8 :
         string -> bool
         
+    (* Return true if the given string can be the start of a valid
+       UTF-8 string, i.e. if it is valid UTF-8 with the possible
+       exception that it may end in the middle of a codepoint. *)
+    val isValidUtf8Prefix :
+        string -> bool
+        
 end = struct
 
     val codepoint_limit = 0wx10ffff

          
@@ 115,7 123,7 @@ end = struct
               | (n, i, cp, result) => f (replacement, result)
         end
                      
-    fun isValidUtf8 s =
+    fun isValidUtf8' mayBePrefix s =
         let open Word
 	    infix 6 orb andb xorb <<
 

          
@@ 124,7 132,7 @@ end = struct
                recursive rather than a fold function *)
                     
             fun check [] (_, _, 0wx0) = true
-              | check [] (_, _, cp)   = false
+              | check [] (_, _, cp)   = mayBePrefix
               | check (char :: chars) (n, i, cp) =
                 let val w = Word.fromLargeWord
                                 (Word8.toLargeWord(Byte.charToByte char))

          
@@ 168,6 176,9 @@ end = struct
         in
             check (explode s) (0, 0, 0wx0)
         end
-            
+                           
+    val isValidUtf8 = isValidUtf8' false          (* mayBePrefix = false: input exhausted with non-zero cp state is rejected *)
+    val isValidUtf8Prefix = isValidUtf8' true     (* mayBePrefix = true: input exhausted with non-zero cp state is accepted *)
+      
 end