76f337555775 — Chris Cannam 2 years ago
Add isValidUtf8, add rejection of surrogate pairs, and update tests
9 files changed, 536 insertions(+), 282 deletions(-)

M .hgignore
M Makefile
M decoder.sml
M process.sml
A => testfiles/broken-check-expected.html
M testfiles/broken-expected.html
M testfiles/broken-input.html
R testfiles/broken-newspec.html => 
A => testfiles/broken-spec.html
M .hgignore +1 -0
@@ 1,4 1,5 @@ 
 syntax: glob
 process
 test-out.txt
+*.deps
 *~

          
M Makefile +8 -4
@@ 7,13 7,17 @@ BROKEN		:= testfiles/broken-input.html
 test:	process
 	@for t in ${TESTFILES} ; do \
 		./process $$t > test-out.txt ; \
-		if diff -u $$t test-out.txt ; then echo Test $$t succeeded ; \
-		else echo Test $$t failed ; \
+		if diff -u $$t test-out.txt ; then echo "Test $$t succeeded" ; \
+		else echo "Test $$t failed" ; \
 		fi ; \
 	done
+	@./process -c ${BROKEN} > test-out.txt
+	@if diff -u testfiles/broken-check-expected.html test-out.txt ; then echo "Test ${BROKEN} (check only) succeeded" ; \
+	else echo "Test ${BROKEN} (check only) failed" ; \
+	fi
 	@./process ${BROKEN} > test-out.txt
-	@if diff -u testfiles/broken-expected.html test-out.txt ; then echo Test ${BROKEN} succeeded ; \
-	else echo Test ${BROKEN} failed ; \
+	@if diff -u testfiles/broken-expected.html test-out.txt ; then echo "Test ${BROKEN} succeeded" ; \
+	else echo "Test ${BROKEN} failed" ; \
 	fi
 
 timing:	process testfiles/long.txt

          
M decoder.sml +63 -0
@@ 9,6 9,9 @@ structure Utf8Decoder :> sig
     val foldlString :
         (word * word list -> word list) -> word list -> string
         -> word list
+
+    val isValidUtf8 :
+        string -> bool
         
 end = struct
 

          
@@ 37,6 40,9 @@ end = struct
           | 3 => 0wx0800
           | 4 => 0wx10000
           | _ => 0wx0
+
+    val surrogate_start = 0wxd800
+    val surrogate_end   = 0wxdfff
                      
     fun foldlString f a s =
         let open Word

          
@@ 86,6 92,9 @@ end = struct
                                          (0, 0, 0wx0, f (replacement, a))
                                      else if cp > codepoint_limit then
                                          (0, 0, 0wx0, f (replacement, a))
+                                     else if cp >= surrogate_start andalso
+                                             cp <= surrogate_end then
+                                         (0, 0, 0wx0, f (replacement, a))
                                      else
                                          (0, 0, 0wx0, f (cp, a))
                                  end

          
@@ 105,6 114,60 @@ end = struct
                 (n, 0, 0wx0, result) => result
               | (n, i, cp, result) => f (replacement, result)
         end
+                     
+    (* Return true if every byte of s belongs to a well-formed UTF-8
+       sequence. Rejected: unrecognised lead bytes, missing or stray
+       continuation bytes, overlong forms (cp < overlong n), values
+       above codepoint_limit, codepoints in the surrogate range, and
+       a sequence left unfinished at the end of s. *)
+    fun isValidUtf8 s =
+        let open Word
+            infix 6 orb andb xorb <<
+
+            (* Similar naming to "decode" within foldlString above.
+               See introductory comment there. This is directly tail-
+               recursive rather than a fold function.
+
+               State is (n, i, cp): n is the total length of the
+               sequence in progress, i the number of continuation
+               bytes still expected, cp the bits accumulated so far. *)
+
+            (* End of input is valid only between sequences, i.e. when
+               no continuation bytes are pending. Testing cp for zero
+               instead would accept a string cut off after a lead byte
+               with an all-zero payload (or after zero-payload
+               continuation bytes), because cp is still 0wx0 there. *)
+            fun check [] (_, 0, _) = true
+              | check [] (_, _, _) = false
+              | check (char :: chars) (n, i, cp) =
+                let val w = Word.fromLargeWord
+                                (Word8.toLargeWord(Byte.charToByte char))
+                    val check' = check chars
+                in
+                    case i of
+                        (* Between sequences: classify the lead byte *)
+                        0 => if w andb b1_mask = 0wx0 then
+                                 check' (0, 0, 0wx0)
+                             else if w andb b2_mask = b2_marker then
+                                 check' (2, 1, w xorb b2_marker)
+                             else if w andb b3_mask = b3_marker then
+                                 check' (3, 2, w xorb b3_marker)
+                             else if w andb b4_mask = b4_marker then
+                                 check' (4, 3, w xorb b4_marker)
+                             else
+                                 false
+
+                        (* Final continuation byte: the codepoint is
+                           complete, so range-check it here *)
+                      | 1 => if w andb bb_mask = bb_marker then
+                                 let val cp = (cp << 0w6) orb (w xorb bb_marker)
+                                 in
+                                     if cp < overlong n then
+                                         false
+                                     else if cp > codepoint_limit then
+                                         false
+                                     else if cp >= surrogate_start andalso
+                                             cp <= surrogate_end then
+                                         false
+                                     else
+                                         check' (0, 0, 0wx0)
+                                 end
+                             else
+                                 false
+
+                        (* Intermediate continuation byte. Int.- is
+                           spelled out because "open Word" rebinds - *)
+                      | i => if w andb bb_mask = bb_marker then
+                                 let val cp = (cp << 0w6) orb (w xorb bb_marker)
+                                 in check' (n, Int.-(i, 1), cp)
+                                 end
+                             else
+                                 false
+                end
+        in
+            check (explode s) (0, 0, 0wx0)
+        end
             
 end
                         

          
M process.sml +12 -1
@@ 16,11 16,22 @@ fun process_file f =
         app_stream (print o process) stream;
         TextIO.closeIn stream
     end
+
+(* Check mode: run every item app_stream yields from file f through
+   Utf8Decoder.isValidUtf8 and echo it, prefixed with "    " when it
+   is valid and "!!! " when it is not. *)
+fun check_file f =
+    let val input = TextIO.openIn f
+        fun marker text =
+            if Utf8Decoder.isValidUtf8 text then "    " else "!!! "
+        fun report text = print (marker text ^ text)
+    in
+        app_stream report input;
+        TextIO.closeIn input
+    end
         
 fun main () =
     case CommandLine.arguments () of
         [infile] => process_file infile
-      | _ => (TextIO.output (TextIO.stdErr, "Usage: process file.txt\n");
+      | ["-c", infile] => check_file infile
+      | _ => (TextIO.output (TextIO.stdErr, "Usage: process [-c] file.txt\n");
               raise Fail "Incorrect arguments specified")
 
                  

          
A => testfiles/broken-check-expected.html +178 -0
@@ 0,0 1,178 @@ 
+    <!DOCTYPE html>
+    <html>
+    <head>
+    	<meta charset="utf-8">
+    	<title>Broken UTF-8</title>
+    	<style>
+    	* {
+    		font-size: 1em;
+    	}
+    	</style>
+    </head>
+    <body>
+    <h1>Broken UTF-8</h1>
+    <p>Any copyright to this file is dedicated to the Public Domain. <a href=https://creativecommons.org/publicdomain/zero/1.0/>https://creativecommons.org/publicdomain/zero/1.0/</a></p>
+    <p>Five-byte and six-byte sequences were defined in <a href=https://tools.ietf.org/html/rfc2279>RFC 2297</a> but are no longer part of the UTF-8 definition.
+    <h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
+    <dl>
+    <dt>Two-byte sequence (C0 80)</dt>
+!!! <dd></dd>
+    <dt>Three-byte sequence (E0 80 80)</dt>
+!!! <dd></dd>
+    <dt>Four-byte sequence (F0 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 80 80 80)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for highest single-byte (U+007F)</h2>
+    <dl>
+    <dt>Two-byte sequence (C1 BF)</dt>
+!!! <dd></dd>
+    <dt>Three-byte sequence (E0 81 BF)</dt>
+!!! <dd></dd>
+    <dt>Four-byte sequence (F0 80 81 BF)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 80 81 BF)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
+    <dl>
+    <dt>Three-byte sequence (E0 82 80)</dt>
+!!! <dd></dd>
+    <dt>Four-byte sequence (F0 80 82 80)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 80 82 80)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 80 82 80)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
+    <dl>
+    <dt>Three-byte sequence (E0 9F BF)</dt>
+!!! <dd></dd>
+    <dt>Four-byte sequence (F0 80 9F BF)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 80 9F BF)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
+    <dl>
+    <dt>Four-byte sequence (F0 80 A0 80)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 80 A0 80)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
+    <dl>
+    <dt>Four-byte sequence (F0 8F BF BF)</dt>
+!!! <dd></dd>
+    <dt>Five-byte sequence (F8 80 8F BF BF)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for lowest four-byte (U+10000)</h2>
+    <dl>
+    <dt>Five-byte sequence (F8 80 90 80 80)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 80 90 80 80)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Non-shortest forms for last Unicode (U+10FFFF)</h2>
+    <dl>
+    <dt>Five-byte sequence (F8 84 8F BF BF)</dt>
+!!! <dd></dd>
+    <dt>Six-byte sequence (FC 80 84 8F BF BF)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Out of range</h2>
+    <dl>
+    <dt>One past Unicode (F4 90 80 80)</dt>
+!!! <dd></dd>
+    <dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
+!!! <dd></dd>
+    <dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
+!!! <dd></dd>
+    <dt>First surrogate (ED A0 80)</dt>
+!!! <dd></dd>
+    <dt>Last surrogate (ED BF BF)</dt>
+!!! <dd></dd>
+    <dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Out of range and non-shortest</h2>
+    <dl>
+    <dt>One past Unicode as five-byte sequence (F8 84 90 80 80)</dt>
+!!! <dd></dd>
+    <dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
+!!! <dd></dd>
+    <dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
+!!! <dd></dd>
+    <dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
+!!! <dd></dd>
+    <dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Lone trails</h2>
+    <dl>
+    <dt>One (80)</dt>
+!!! <dd></dd>
+    <dt>Two (80 80)</dt>
+!!! <dd></dd>
+    <dt>Three (80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Four (80 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Five (80 80 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Six (80 80 80 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>Seven (80 80 80 80 80 80 80)</dt>
+!!! <dd></dd>
+    <dt>After valid two-byte (C2 B6 80)</dt>
+!!! <dd>¶</dd>
+    <dt>After valid three-byte (E2 98 83 80)</dt>
+!!! <dd>☃</dd>
+    <dt>After valid four-byte (F0 9F 92 A9 80)</dt>
+!!! <dd>💩</dd>
+    <dt>After five-byte (FB BF BF BF BF 80)</dt>
+!!! <dd></dd>
+    <dt>After six-byte (FD BF BF BF BF BF 80)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Truncated sequences</h2>
+    <dl>
+    <dt>Two-byte lead (C2)</dt>
+!!! <dd></dd>
+    <dt>Three-byte lead (E2)</dt>
+!!! <dd></dd>
+    <dt>Three-byte lead and one trail (E2 98)</dt>
+!!! <dd></dd>
+    <dt>Four-byte lead (F0)</dt>
+!!! <dd></dd>
+    <dt>Four-byte lead and one trail (F0 9F)</dt>
+!!! <dd></dd>
+    <dt>Four-byte lead and two trails (F0 9F 92)</dt>
+!!! <dd></dd>
+    </dl>
+    <h2>Leftovers</h2>
+    <dl>
+    <dt>FE (FE)</dt>
+!!! <dd></dd>
+    <dt>FE and trail (FE 80)</dt>
+!!! <dd></dd>
+    <dt>FF (FF)</dt>
+!!! <dd></dd>
+    <dt>FF and trail (FF 80)</dt>
+!!! <dd></dd>
+    </dl>
+    </body>
+    </html>

          
M testfiles/broken-expected.html +27 -28
@@ 1,6 1,6 @@ 
 <!DOCTYPE html>
-<html><head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<html>
+<head>
 	<meta charset="utf-8">
 	<title>Broken UTF-8</title>
 	<style>

          
@@ 11,16 11,16 @@ 
 </head>
 <body>
 <h1>Broken UTF-8</h1>
-<p>Any copyright to this file is dedicated to the Public Domain. <a href="https://creativecommons.org/publicdomain/zero/1.0/">https://creativecommons.org/publicdomain/zero/1.0/</a></p>
-<p>Five-byte and six-byte sequences were defined in <a href="https://tools.ietf.org/html/rfc2279">RFC 2297</a> but are no longer part of the UTF-8 definition.
-</p><h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
+<p>Any copyright to this file is dedicated to the Public Domain. <a href=https://creativecommons.org/publicdomain/zero/1.0/>https://creativecommons.org/publicdomain/zero/1.0/</a></p>
+<p>Five-byte and six-byte sequences were defined in <a href=https://tools.ietf.org/html/rfc2279>RFC 2297</a> but are no longer part of the UTF-8 definition.
+<h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
 <dl>
 <dt>Two-byte sequence (C0 80)</dt>
-<dd>��</dd>
+<dd>�</dd>
 <dt>Three-byte sequence (E0 80 80)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>Four-byte sequence (F0 80 80 80)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 80 80 80)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 80 80 80)</dt>

          
@@ 29,11 29,11 @@ 
 <h2>Non-shortest forms for highest single-byte (U+007F)</h2>
 <dl>
 <dt>Two-byte sequence (C1 BF)</dt>
-<dd>��</dd>
+<dd>�</dd>
 <dt>Three-byte sequence (E0 81 BF)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>Four-byte sequence (F0 80 81 BF)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 80 81 BF)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>

          
@@ 42,9 42,9 @@ 
 <h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
 <dl>
 <dt>Three-byte sequence (E0 82 80)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>Four-byte sequence (F0 80 82 80)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 80 82 80)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 80 82 80)</dt>

          
@@ 53,9 53,9 @@ 
 <h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
 <dl>
 <dt>Three-byte sequence (E0 9F BF)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>Four-byte sequence (F0 80 9F BF)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 80 9F BF)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>

          
@@ 64,7 64,7 @@ 
 <h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
 <dl>
 <dt>Four-byte sequence (F0 80 A0 80)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 80 A0 80)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>

          
@@ 73,7 73,7 @@ 
 <h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
 <dl>
 <dt>Four-byte sequence (F0 8F BF BF)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Five-byte sequence (F8 80 8F BF BF)</dt>
 <dd>�����</dd>
 <dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>

          
@@ 96,17 96,17 @@ 
 <h2>Out of range</h2>
 <dl>
 <dt>One past Unicode (F4 90 80 80)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
 <dd>�����</dd>
 <dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
 <dd>������</dd>
 <dt>First surrogate (ED A0 80)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>Last surrogate (ED BF BF)</dt>
-<dd>���</dd>
+<dd>�</dd>
 <dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
-<dd>������</dd>
+<dd>��</dd>
 </dl>
 <h2>Out of range and non-shortest</h2>
 <dl>

          
@@ 115,11 115,11 @@ 
 <dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
 <dd>������</dd>
 <dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
-<dd>����</dd>
+<dd>�</dd>
 <dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
-<dd>��������</dd>
+<dd>��</dd>
 </dl>
 <h2>Lone trails</h2>
 <dl>

          
@@ 133,7 133,7 @@ 
 <dd>����</dd>
 <dt>Five (80 80 80 80 80)</dt>
 <dd>�����</dd>
-<dt>Six (80 80 80 80 80)</dt>
+<dt>Six (80 80 80 80 80 80)</dt>
 <dd>������</dd>
 <dt>Seven (80 80 80 80 80 80 80)</dt>
 <dd>�������</dd>

          
@@ 174,6 174,5 @@ 
 <dt>FF and trail (FF 80)</dt>
 <dd>��</dd>
 </dl>
-
-
-</body></html>
+</body>
+</html>

          
M testfiles/broken-input.html +69 -70
@@ 1,6 1,6 @@ 
 <!DOCTYPE html>
-<html><head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<html>
+<head>
 	<meta charset="utf-8">
 	<title>Broken UTF-8</title>
 	<style>

          
@@ 11,169 11,168 @@ 
 </head>
 <body>
 <h1>Broken UTF-8</h1>
-<p>Any copyright to this file is dedicated to the Public Domain. <a href="https://creativecommons.org/publicdomain/zero/1.0/">https://creativecommons.org/publicdomain/zero/1.0/</a></p>
-<p>Five-byte and six-byte sequences were defined in <a href="https://tools.ietf.org/html/rfc2279">RFC 2297</a> but are no longer part of the UTF-8 definition.
-</p><h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
+<p>Any copyright to this file is dedicated to the Public Domain. <a href=https://creativecommons.org/publicdomain/zero/1.0/>https://creativecommons.org/publicdomain/zero/1.0/</a></p>
+<p>Five-byte and six-byte sequences were defined in <a href=https://tools.ietf.org/html/rfc2279>RFC 2297</a> but are no longer part of the UTF-8 definition.
+<h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
 <dl>
 <dt>Two-byte sequence (C0 80)</dt>
-<dd>��</dd>
+<dd></dd>
 <dt>Three-byte sequence (E0 80 80)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Four-byte sequence (F0 80 80 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 80 80 80)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 80 80 80)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for highest single-byte (U+007F)</h2>
 <dl>
 <dt>Two-byte sequence (C1 BF)</dt>
-<dd>��</dd>
+<dd></dd>
 <dt>Three-byte sequence (E0 81 BF)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Four-byte sequence (F0 80 81 BF)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 80 81 BF)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
 <dl>
 <dt>Three-byte sequence (E0 82 80)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Four-byte sequence (F0 80 82 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 80 82 80)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 80 82 80)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
 <dl>
 <dt>Three-byte sequence (E0 9F BF)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Four-byte sequence (F0 80 9F BF)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 80 9F BF)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
 <dl>
 <dt>Four-byte sequence (F0 80 A0 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 80 A0 80)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
 <dl>
 <dt>Four-byte sequence (F0 8F BF BF)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five-byte sequence (F8 80 8F BF BF)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for lowest four-byte (U+10000)</h2>
 <dl>
 <dt>Five-byte sequence (F8 80 90 80 80)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 80 90 80 80)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Non-shortest forms for last Unicode (U+10FFFF)</h2>
 <dl>
 <dt>Five-byte sequence (F8 84 8F BF BF)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Six-byte sequence (FC 80 84 8F BF BF)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Out of range</h2>
 <dl>
 <dt>One past Unicode (F4 90 80 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
-<dd>������</dd>
+<dd></dd>
 <dt>First surrogate (ED A0 80)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Last surrogate (ED BF BF)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
-<dd>������</dd>
+<dd></dd>
 </dl>
 <h2>Out of range and non-shortest</h2>
 <dl>
 <dt>One past Unicode as five-byte sequence (F8 84 90 80 80)</dt>
-<dd>�����</dd>
+<dd></dd>
 <dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
-<dd>������</dd>
+<dd></dd>
 <dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
-<dd>��������</dd>
+<dd></dd>
 </dl>
 <h2>Lone trails</h2>
 <dl>
 <dt>One (80)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Two (80 80)</dt>
-<dd>��</dd>
+<dd></dd>
 <dt>Three (80 80 80)</dt>
-<dd>���</dd>
+<dd></dd>
 <dt>Four (80 80 80 80)</dt>
-<dd>����</dd>
+<dd></dd>
 <dt>Five (80 80 80 80 80)</dt>
-<dd>�����</dd>
-<dt>Six (80 80 80 80 80)</dt>
-<dd>������</dd>
+<dd></dd>
+<dt>Six (80 80 80 80 80 80)</dt>
+<dd></dd>
 <dt>Seven (80 80 80 80 80 80 80)</dt>
-<dd>�������</dd>
+<dd></dd>
 <dt>After valid two-byte (C2 B6 80)</dt>
-<dd>¶�</dd>
+<dd>¶</dd>
 <dt>After valid three-byte (E2 98 83 80)</dt>
-<dd>☃�</dd>
+<dd>☃</dd>
 <dt>After valid four-byte (F0 9F 92 A9 80)</dt>
-<dd>💩�</dd>
+<dd>💩</dd>
 <dt>After five-byte (FB BF BF BF BF 80)</dt>
-<dd>������</dd>
+<dd></dd>
 <dt>After six-byte (FD BF BF BF BF BF 80)</dt>
-<dd>�������</dd>
+<dd></dd>
 </dl>
 <h2>Truncated sequences</h2>
 <dl>
 <dt>Two-byte lead (C2)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Three-byte lead (E2)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Three-byte lead and one trail (E2 98)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Four-byte lead (F0)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Four-byte lead and one trail (F0 9F)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>Four-byte lead and two trails (F0 9F 92)</dt>
-<dd>�</dd>
+<dd></dd>
 </dl>
 <h2>Leftovers</h2>
 <dl>
 <dt>FE (FE)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>FE and trail (FE 80)</dt>
-<dd>��</dd>
+<dd></dd>
 <dt>FF (FF)</dt>
-<dd>�</dd>
+<dd></dd>
 <dt>FF and trail (FF 80)</dt>
-<dd>��</dd>
+<dd></dd>
 </dl>
-
-
-</body></html>
+</body>
+</html>

          
R testfiles/broken-newspec.html =>  +0 -179
@@ 1,179 0,0 @@ 
-<!DOCTYPE html>
-<html><head>
-<meta http-equiv="content-type" content="text/html; charset=UTF-8">
-	<meta charset="utf-8">
-	<title>Broken UTF-8</title>
-	<style>
-	* {
-		font-size: 1em;
-	}
-	</style>
-<style>@media print {#ghostery-purple-box {display:none !important}}</style></head>
-<body>
-<h1>Broken UTF-8</h1>
-<p>Any copyright to this file is dedicated to the Public Domain. <a href="https://creativecommons.org/publicdomain/zero/1.0/">https://creativecommons.org/publicdomain/zero/1.0/</a></p>
-<p>Five-byte and six-byte sequences were defined in <a href="https://tools.ietf.org/html/rfc2279">RFC 2297</a> but are no longer part of the UTF-8 definition.
-</p><h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
-<dl>
-<dt>Two-byte sequence (C0 80)</dt>
-<dd>�</dd>
-<dt>Three-byte sequence (E0 80 80)</dt>
-<dd>�</dd>
-<dt>Four-byte sequence (F0 80 80 80)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 80 80 80)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 80 80 80)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for highest single-byte (U+007F)</h2>
-<dl>
-<dt>Two-byte sequence (C1 BF)</dt>
-<dd>�</dd>
-<dt>Three-byte sequence (E0 81 BF)</dt>
-<dd>�</dd>
-<dt>Four-byte sequence (F0 80 81 BF)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 80 81 BF)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
-<dl>
-<dt>Three-byte sequence (E0 82 80)</dt>
-<dd>�</dd>
-<dt>Four-byte sequence (F0 80 82 80)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 80 82 80)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 80 82 80)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
-<dl>
-<dt>Three-byte sequence (E0 9F BF)</dt>
-<dd>�</dd>
-<dt>Four-byte sequence (F0 80 9F BF)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 80 9F BF)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
-<dl>
-<dt>Four-byte sequence (F0 80 A0 80)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 80 A0 80)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
-<dl>
-<dt>Four-byte sequence (F0 8F BF BF)</dt>
-<dd>�</dd>
-<dt>Five-byte sequence (F8 80 8F BF BF)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for lowest four-byte (U+10000)</h2>
-<dl>
-<dt>Five-byte sequence (F8 80 90 80 80)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 80 90 80 80)</dt>
-<dd>�</dd>
-</dl>
-<h2>Non-shortest forms for last Unicode (U+10FFFF)</h2>
-<dl>
-<dt>Five-byte sequence (F8 84 8F BF BF)</dt>
-<dd>�</dd>
-<dt>Six-byte sequence (FC 80 84 8F BF BF)</dt>
-<dd>�</dd>
-</dl>
-<h2>Out of range</h2>
-<dl>
-<dt>One past Unicode (F4 90 80 80)</dt>
-<dd>�</dd>
-<dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
-<dd>�</dd>
-<dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
-<dd>�</dd>
-<dt>First surrogate (ED A0 80)</dt>
-<dd>�</dd>
-<dt>Last surrogate (ED BF BF)</dt>
-<dd>�</dd>
-<dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
-<dd>��</dd>
-</dl>
-<h2>Out of range and non-shortest</h2>
-<dl>
-<dt>One past Unicode as five-byte sequence (F8 84 90 80 80)</dt>
-<dd>�</dd>
-<dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
-<dd>�</dd>
-<dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
-<dd>�</dd>
-<dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
-<dd>�</dd>
-<dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
-<dd>��</dd>
-</dl>
-<h2>Lone trails</h2>
-<dl>
-<dt>One (80)</dt>
-<dd>�</dd>
-<dt>Two (80 80)</dt>
-<dd>��</dd>
-<dt>Three (80 80 80)</dt>
-<dd>���</dd>
-<dt>Four (80 80 80 80)</dt>
-<dd>����</dd>
-<dt>Five (80 80 80 80 80)</dt>
-<dd>�����</dd>
-<dt>Six (80 80 80 80 80)</dt>
-<dd>������</dd>
-<dt>Seven (80 80 80 80 80 80 80)</dt>
-<dd>�������</dd>
-<dt>After valid two-byte (C2 B6 80)</dt>
-<dd>¶�</dd>
-<dt>After valid three-byte (E2 98 83 80)</dt>
-<dd>☃�</dd>
-<dt>After valid four-byte (F0 9F 92 A9 80)</dt>
-<dd>💩�</dd>
-<dt>After five-byte (FB BF BF BF BF 80)</dt>
-<dd>��</dd>
-<dt>After six-byte (FD BF BF BF BF BF 80)</dt>
-<dd>��</dd>
-</dl>
-<h2>Truncated sequences</h2>
-<dl>
-<dt>Two-byte lead (C2)</dt>
-<dd>�</dd>
-<dt>Three-byte lead (E2)</dt>
-<dd>�</dd>
-<dt>Three-byte lead and one trail (E2 98)</dt>
-<dd>�</dd>
-<dt>Four-byte lead (F0)</dt>
-<dd>�</dd>
-<dt>Four-byte lead and one trail (F0 9F)</dt>
-<dd>�</dd>
-<dt>Four-byte lead and two trails (F0 9F 92)</dt>
-<dd>�</dd>
-</dl>
-<h2>Leftovers</h2>
-<dl>
-<dt>FE (FE)</dt>
-<dd>�</dd>
-<dt>FE and trail (FE 80)</dt>
-<dd>��</dd>
-<dt>FF (FF)</dt>
-<dd>�</dd>
-<dt>FF and trail (FF 80)</dt>
-<dd>��</dd>
-</dl>
-
-
-</body></html>
  No newline at end of file

          
A => testfiles/broken-spec.html +178 -0
@@ 0,0 1,178 @@ 
+<!DOCTYPE html>
+<html>
+<head>
+	<meta charset="utf-8">
+	<title>Broken UTF-8</title>
+	<style>
+	* {
+		font-size: 1em;
+	}
+	</style>
+</head>
+<body>
+<h1>Broken UTF-8</h1>
+<p>Any copyright to this file is dedicated to the Public Domain. <a href=https://creativecommons.org/publicdomain/zero/1.0/>https://creativecommons.org/publicdomain/zero/1.0/</a></p>
+<p>Five-byte and six-byte sequences were defined in <a href=https://tools.ietf.org/html/rfc2279>RFC 2297</a> but are no longer part of the UTF-8 definition.
+<h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
+<dl>
+<dt>Two-byte sequence (C0 80)</dt>
+<dd>��</dd>
+<dt>Three-byte sequence (E0 80 80)</dt>
+<dd>���</dd>
+<dt>Four-byte sequence (F0 80 80 80)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 80 80 80)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 80 80 80)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for highest single-byte (U+007F)</h2>
+<dl>
+<dt>Two-byte sequence (C1 BF)</dt>
+<dd>��</dd>
+<dt>Three-byte sequence (E0 81 BF)</dt>
+<dd>���</dd>
+<dt>Four-byte sequence (F0 80 81 BF)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 80 81 BF)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
+<dl>
+<dt>Three-byte sequence (E0 82 80)</dt>
+<dd>���</dd>
+<dt>Four-byte sequence (F0 80 82 80)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 80 82 80)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 80 82 80)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
+<dl>
+<dt>Three-byte sequence (E0 9F BF)</dt>
+<dd>���</dd>
+<dt>Four-byte sequence (F0 80 9F BF)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 80 9F BF)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
+<dl>
+<dt>Four-byte sequence (F0 80 A0 80)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 80 A0 80)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
+<dl>
+<dt>Four-byte sequence (F0 8F BF BF)</dt>
+<dd>����</dd>
+<dt>Five-byte sequence (F8 80 8F BF BF)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for lowest four-byte (U+10000)</h2>
+<dl>
+<dt>Five-byte sequence (F8 80 90 80 80)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 80 90 80 80)</dt>
+<dd>������</dd>
+</dl>
+<h2>Non-shortest forms for last Unicode (U+10FFFF)</h2>
+<dl>
+<dt>Five-byte sequence (F8 84 8F BF BF)</dt>
+<dd>�����</dd>
+<dt>Six-byte sequence (FC 80 84 8F BF BF)</dt>
+<dd>������</dd>
+</dl>
+<h2>Out of range</h2>
+<dl>
+<dt>One past Unicode (F4 90 80 80)</dt>
+<dd>����</dd>
+<dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
+<dd>�����</dd>
+<dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
+<dd>������</dd>
+<dt>First surrogate (ED A0 80)</dt>
+<dd>���</dd>
+<dt>Last surrogate (ED BF BF)</dt>
+<dd>���</dd>
+<dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
+<dd>������</dd>
+</dl>
+<h2>Out of range and non-shortest</h2>
+<dl>
+<dt>One past Unicode as five-byte sequence (F8 84 90 80 80)</dt>
+<dd>�����</dd>
+<dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
+<dd>������</dd>
+<dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
+<dd>����</dd>
+<dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
+<dd>����</dd>
+<dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
+<dd>��������</dd>
+</dl>
+<h2>Lone trails</h2>
+<dl>
+<dt>One (80)</dt>
+<dd>�</dd>
+<dt>Two (80 80)</dt>
+<dd>��</dd>
+<dt>Three (80 80 80)</dt>
+<dd>���</dd>
+<dt>Four (80 80 80 80)</dt>
+<dd>����</dd>
+<dt>Five (80 80 80 80 80)</dt>
+<dd>�����</dd>
+<dt>Six (80 80 80 80 80 80)</dt>
+<dd>������</dd>
+<dt>Seven (80 80 80 80 80 80 80)</dt>
+<dd>�������</dd>
+<dt>After valid two-byte (C2 B6 80)</dt>
+<dd>¶�</dd>
+<dt>After valid three-byte (E2 98 83 80)</dt>
+<dd>☃�</dd>
+<dt>After valid four-byte (F0 9F 92 A9 80)</dt>
+<dd>💩�</dd>
+<dt>After five-byte (FB BF BF BF BF 80)</dt>
+<dd>������</dd>
+<dt>After six-byte (FD BF BF BF BF BF 80)</dt>
+<dd>�������</dd>
+</dl>
+<h2>Truncated sequences</h2>
+<dl>
+<dt>Two-byte lead (C2)</dt>
+<dd>�</dd>
+<dt>Three-byte lead (E2)</dt>
+<dd>�</dd>
+<dt>Three-byte lead and one trail (E2 98)</dt>
+<dd>�</dd>
+<dt>Four-byte lead (F0)</dt>
+<dd>�</dd>
+<dt>Four-byte lead and one trail (F0 9F)</dt>
+<dd>�</dd>
+<dt>Four-byte lead and two trails (F0 9F 92)</dt>
+<dd>�</dd>
+</dl>
+<h2>Leftovers</h2>
+<dl>
+<dt>FE (FE)</dt>
+<dd>�</dd>
+<dt>FE and trail (FE 80)</dt>
+<dd>��</dd>
+<dt>FF (FF)</dt>
+<dd>�</dd>
+<dt>FF and trail (FF 80)</dt>
+<dd>��</dd>
+</dl>
+</body>
+</html>