Finally add entity support; consequently the encoding is now definitively UTF-8, not just "ASCII-compatible"
M .hgignore +1 -0
@@ 3,3 3,4 @@ syntax: glob
 *.log
 subxmlparse
 out.xml
+testfiles/tmp

          
M README.md +4 -5
@@ 13,16 13,15 @@ The format supported by SubXml consists 
 text, CDATA, and comment syntax from XML. It differs from XML in the
 following ways:
 
- * The document is assumed to be in an 8-bit "ASCII-compatible" format
-   such as UTF-8; UTF-16 is not supported
+ * The only supported character encoding is UTF-8
 
  * Processing instructions (<? ... ?>) are ignored
 
  * DOCTYPE declarations are ignored; all other declarations (<! ... >)
    are rejected except for CDATA, which is handled properly
 
- * Character and entity references (&-escapes) have no special status
-   and are just passed through literally
+ * Comments appearing before or after the root element are ignored;
+   other comments are included in the parsed tree
 
 Note that although the parser is limited, it is not forgiving --
 anything it can't understand is rejected with a clear error

          
@@ 34,6 33,6 @@ An equally simplistic serialiser is also
 For a proper XML parser in SML, consider fxp, mirrored at
 https://github.com/cannam/fxp.
 
-Copyright 2018 Chris Cannam.
+Copyright 2018-2021 Chris Cannam.
 MIT/X11 licence. See the file COPYING for details.
 

          
M subxml.sml +88 -10
@@ 2,7 2,7 @@ 
 (* SubXml - A parser for a subset of XML
    =====================================
 
-   https://bitbucket.org/cannam/sml-subxml
+   https://hg.sr.ht/~cannam/sml-subxml
 
    SubXml is a parser and serialiser for a format resembling XML. It
    can be used as a minimal parser for small XML configuration or

          
@@ 13,16 13,18 @@ 
    text, CDATA, and comment syntax from XML. It differs from XML in
    the following ways:
 
-   * The document is assumed to be in an 8-bit "ASCII-compatible"
-     format such as UTF-8; UTF-16 is not supported
+   * The only supported character encoding is UTF-8
 
    * Processing instructions (<? ... ?>) are ignored
 
    * DOCTYPE declarations are ignored; all other declarations (<!
      ... >) are rejected except for CDATA, which is handled properly
 
-   * Character and entity references (&-escapes) have no special
-     status and are just passed through literally
+   * It follows from the above that only built-in named entities and
+     character entities are decoded (but those ones are)
+
+   * Comments appearing before or after the root element are ignored;
+     other comments are included in the parsed tree
 
    Note that although the parser is limited, it is not forgiving --
    anything it can't understand is rejected with a clear error

          
@@ 32,7 34,7 @@ 
 
    An equally simplistic serialiser is also provided.
    
-   Copyright 2018 Chris Cannam.
+   Copyright 2018-2021 Chris Cannam.
 
    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation

          
@@ 136,7 138,66 @@ structure SubXml :> SUBXML = struct
         fun tokenError pos token =
             error pos ("Unexpected token '" ^ Char.toString token ^ "'")
 
-        val nameEnd = explode " \t\n\r\"'</>!=?"
+        val nameEnd = explode " \t\n\r\"'</>!=?&"
+
+        fun numChar n =
+            let open Word
+                infix 6 orb andb >>
+                fun chars ww = SOME (map (Char.chr o toInt) ww)
+                val c = fromInt n
+            in
+                if c < 0wx80 then
+                    chars [c]
+                else if c < 0wx800 then
+                    chars [0wxc0 orb (c >> 0w6),
+                           0wx80 orb (c andb 0wx3f)]
+                else if c < 0wx10000 then
+                    chars [0wxe0 orb (c >> 0w12),
+                           0wx80 orb ((c >> 0w6) andb 0wx3f),
+                           0wx80 orb (c andb 0wx3f)]
+                else if c < 0wx10ffff then
+	            chars [0wxf0 orb (c >> 0w18),
+	                   0wx80 orb ((c >> 0w12) andb 0wx3f),
+                           0wx80 orb ((c >> 0w6) andb 0wx3f),
+                           0wx80 orb (c andb 0wx3f)]
+                else NONE
+            end
+
+        fun hexChar h =
+            Option.mapPartial numChar
+                              (StringCvt.scanString (Int.scan StringCvt.HEX) h)
+
+        fun decChar d =
+            Option.mapPartial numChar
+                              (Int.fromString d)
+                
+        fun entity pos cc =
+            let fun entity' decoder pos text [] =
+                    error pos "Document ends during hex character entity"
+                  | entity' decoder pos text (c :: rest) =
+                    if c <> #";"
+                    then entity' decoder (pos+1) (c :: text) rest
+                    else case decoder (implode (rev text)) of
+                             NONE => error pos "Invalid character entity"
+                           | SOME chars => OK (chars, rest, pos+1)
+            in
+                case cc of
+                    #"q" :: #"u" :: #"o" :: #"t" :: #";" :: rest =>
+                    OK ([#"\""], rest, pos+5)
+                  | #"a" :: #"m" :: #"p" :: #";" :: rest =>
+                    OK ([#"&"], rest, pos+4)
+                  | #"a" :: #"p" :: #"o" :: #"s" :: #";" :: rest =>
+                    OK ([#"'"], rest, pos+5)
+                  | #"l" :: #"t" :: #";" :: rest =>
+                    OK ([#"<"], rest, pos+3)
+                  | #"g" :: #"t" :: #";" :: rest => 
+                    OK ([#">"], rest, pos+3)
+                  | #"#" :: #"x" :: rest =>
+                    entity' hexChar (pos+2) [] rest
+                  | #"#" :: rest =>
+                    entity' decChar (pos+1) [] rest
+                  | _ => error pos "Invalid entity"
+            end
                               
         fun quoted quote pos acc cc =
             let fun quoted' pos text [] =

          
@@ 144,6 205,11 @@ structure SubXml :> SUBXML = struct
                   | quoted' pos text (x::xs) =
                     if x = quote
                     then OK (rev text, xs, pos+1)
+                    else if x = #"&"
+                    then case entity (pos+1) xs of
+                             ERROR e => ERROR e
+                           | OK (chars, rest, newpos) =>
+                             quoted' newpos (rev chars @ text) rest
                     else quoted' (pos+1) (x::text) xs
             in
                 case quoted' pos [] cc of

          
@@ 245,6 311,10 @@ structure SubXml :> SUBXML = struct
                         #"<" => if text = []
                                 then left (pos+1) acc xs
                                 else left (pos+1) (textOf text :: acc) xs
+                      | #"&" => (case entity (pos+1) xs of
+                                     ERROR e => ERROR e
+                                   | OK (chars, rest, newpos) =>
+                                     outside' newpos (rev chars @ text) acc rest)
                       | x => outside' (pos+1) (x::text) acc xs
             in
                 outside' pos [] acc cc

          
@@ 373,17 443,25 @@ structure SubXml :> SUBXML = struct
                 (map node (List.filter
                                (fn ATTRIBUTE _ => false | _ => true)
                                nodes))
+
+        and encode text =
+            String.translate (fn #"\"" => "&quot;"
+                             | #"&" => "&amp;"
+                             | #"'" => "&apos;"
+                             | #"<" => "&lt;"
+                             | #">" => "&gt;"
+                             | c => str c) text
                 
         and node n =
             case n of
                 TEXT string =>
-                string
+                encode string
               | CDATA string =>
                 "<![CDATA[" ^ string ^ "]]>"
               | COMMENT string =>
-                "<!-- " ^ string ^ "-->"
+                "<!--" ^ string ^ "-->"
               | ATTRIBUTE { name, value } =>
-                name ^ "=" ^ "\"" ^ value ^ "\"" (*!!!*)
+                name ^ "=" ^ "\"" ^ encode value ^ "\""
               | ELEMENT { name, children } =>
                 "<" ^ name ^
                 (case (attributes children) of

          
M test.sh +34 -4
@@ 5,6 5,14 @@ set -eu
 pass=0
 fail=0
 
+if [ ! -d testfiles/good ]; then
+    echo "Run this from the repo root"
+    exit 2
+fi
+
+mkdir -p testfiles/tmp
+outdir=testfiles/tmp
+
 for file in testfiles/bad/*.xml ; do
 
     if ./subxmlparse "$file" ; then

          
@@ 19,10 27,32 @@ done
 
 for file in testfiles/good/*.xml ; do
 
-    if ./subxmlparse "$file" > out.xml ; then
-        if xmllint out.xml >/dev/null ; then
-            echo "--- pass: $file"
-            pass=$(($pass + 1))
+    outfile="$outdir/out.xml"
+    lintfile_orig="$outdir/lint_orig.xml"
+    lintfile_mine="$outdir/lint_mine.xml"
+    
+    if ./subxmlparse "$file" > "$outfile" ; then
+        if xmllint --encode UTF-8 --format --output "$lintfile_mine" "$outfile" >/dev/null ; then
+            xmllint --encode UTF-8 --format "$file" |
+                # Our expected diversion from the original
+                grep -v '<!-- Comment before document, ignored -->' |
+                grep -v '<!-- Comment after document, ignored -->' > "$lintfile_orig"
+            if ! diff -q "$lintfile_orig" "$lintfile_mine" >/dev/null; then
+                echo "*** FAIL: $file output does not match expected"
+                echo; echo "*** Input:"
+                cat "$file"
+                echo "*** Output:"
+                cat "$outfile"
+                echo "*** Linted input:"
+                cat "$lintfile_orig"
+                echo "*** Linted output (should match linted input):"
+                cat "$lintfile_mine"
+                echo "***"; echo
+                fail=$(($fail + 1))
+            else
+                echo "--- pass: $file"
+                pass=$(($pass + 1))
+            fi
         else
             echo "*** FAIL: $file"
             fail=$(($fail + 1))

          
A => testfiles/bad/entities-1.xml +1 -0
@@ 0,0 1,1 @@ 
+<myelement myattribute="Some &quotation;value&quot; with &lt;unknown entities&gt; &amp; &apost;things&apos; in it"/>

          
A => testfiles/bad/entities-2.xml +2 -0
@@ 0,0 1,2 @@ 
+<myelement>Some &apos;text&apostrophe; and the &lt; and &gt; characters expressed as &#x003c;numerical escapes&#X003e;</myelement>
+

          
A => testfiles/bad/entities-3.xml +1 -0
@@ 0,0 1,1 @@ 
+<myelement my&#x0061;ttribute='Some value'/>

          
A => testfiles/bad/entities-4.xml +1 -0
@@ 0,0 1,1 @@ 
+<myelement>Some &apos;text&apos; and the &lt; and &gt; characters expressed as &#x003c;numerical escapes&#x1234003e;</myelement>

          
A => testfiles/good/comment-3.xml +2 -0
@@ 0,0 1,2 @@ 
+<element>content</element>
+<!-- Comment after document, ignored -->

          
A => testfiles/good/entities-1.xml +1 -0
@@ 0,0 1,1 @@ 
+<myelement myattribute="Some &quot;value&quot; &#x003c;with&#062; &lt;entities&gt; &amp; &apos;things&apos; in it"/>

          
A => testfiles/good/entities-2.xml +2 -0
@@ 0,0 1,2 @@ 
+<myelement>Some &apos;text&apos; and the &lt; and &gt; characters expressed as &#x003c;numerical escapes&#62;</myelement>
+

          
A => testfiles/good/entities-3.xml +7 -0
@@ 0,0 1,7 @@ 
+<element>
+
+    &#129335; 🤷 Shrug
+    &#10084; ❤ Heavy Black Heart
+    &#128169; 💩 Pile of Poo
+
+</element>