M .hgignore +1 -0
@@ 3,3 3,4 @@ syntax: glob
*.log
subxmlparse
out.xml
+testfiles/tmp
M README.md +4 -5
@@ 13,16 13,15 @@ The format supported by SubXml consists
text, CDATA, and comment syntax from XML. It differs from XML in the
following ways:
- * The document is assumed to be in an 8-bit "ASCII-compatible" format
- such as UTF-8; UTF-16 is not supported
+ * The only supported character encoding is UTF-8
* Processing instructions (<? ... ?>) are ignored
* DOCTYPE declarations are ignored; all other declarations (<! ... >)
are rejected except for CDATA, which is handled properly
- * Character and entity references (&-escapes) have no special status
- and are just passed through literally
+ * Comments appearing before or after the root element are ignored;
+ other comments are included in the parsed tree
Note that although the parser is limited, it is not forgiving --
anything it can't understand is rejected with a clear error
@@ 34,6 33,6 @@ An equally simplistic serialiser is also
For a proper XML parser in SML, consider fxp, mirrored at
https://github.com/cannam/fxp.
-Copyright 2018 Chris Cannam.
+Copyright 2018-2021 Chris Cannam.
MIT/X11 licence. See the file COPYING for details.
M subxml.sml +88 -10
@@ 2,7 2,7 @@
(* SubXml - A parser for a subset of XML
=====================================
- https://bitbucket.org/cannam/sml-subxml
+ https://hg.sr.ht/~cannam/sml-subxml
SubXml is a parser and serialiser for a format resembling XML. It
can be used as a minimal parser for small XML configuration or
@@ 13,16 13,18 @@
text, CDATA, and comment syntax from XML. It differs from XML in
the following ways:
- * The document is assumed to be in an 8-bit "ASCII-compatible"
- format such as UTF-8; UTF-16 is not supported
+ * The only supported character encoding is UTF-8
* Processing instructions (<? ... ?>) are ignored
* DOCTYPE declarations are ignored; all other declarations (<!
... >) are rejected except for CDATA, which is handled properly
- * Character and entity references (&-escapes) have no special
- status and are just passed through literally
+ * It follows from the above that only built-in named entities and
+ character entities are decoded (but those ones are)
+
+ * Comments appearing before or after the root element are ignored;
+ other comments are included in the parsed tree
Note that although the parser is limited, it is not forgiving --
anything it can't understand is rejected with a clear error
@@ 32,7 34,7 @@
An equally simplistic serialiser is also provided.
- Copyright 2018 Chris Cannam.
+ Copyright 2018-2021 Chris Cannam.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
@@ 136,7 138,66 @@ structure SubXml :> SUBXML = struct
fun tokenError pos token =
error pos ("Unexpected token '" ^ Char.toString token ^ "'")
- val nameEnd = explode " \t\n\r\"'</>!=?"
+ val nameEnd = explode " \t\n\r\"'</>!=?&"
+
+ fun numChar n =
+ let open Word
+ infix 6 orb andb >>
+ fun chars ww = SOME (map (Char.chr o toInt) ww)
+ val c = fromInt n
+ in
+ if c < 0wx80 then
+ chars [c]
+ else if c < 0wx800 then
+ chars [0wxc0 orb (c >> 0w6),
+ 0wx80 orb (c andb 0wx3f)]
+ else if c < 0wx10000 then
+ chars [0wxe0 orb (c >> 0w12),
+ 0wx80 orb ((c >> 0w6) andb 0wx3f),
+ 0wx80 orb (c andb 0wx3f)]
+ else if c < 0wx10ffff then
+ chars [0wxf0 orb (c >> 0w18),
+ 0wx80 orb ((c >> 0w12) andb 0wx3f),
+ 0wx80 orb ((c >> 0w6) andb 0wx3f),
+ 0wx80 orb (c andb 0wx3f)]
+ else NONE
+ end
+
+ fun hexChar h =
+ Option.mapPartial numChar
+ (StringCvt.scanString (Int.scan StringCvt.HEX) h)
+
+ fun decChar d =
+ Option.mapPartial numChar
+ (Int.fromString d)
+
+ fun entity pos cc =
+ let fun entity' decoder pos text [] =
+ error pos "Document ends during hex character entity"
+ | entity' decoder pos text (c :: rest) =
+ if c <> #";"
+ then entity' decoder (pos+1) (c :: text) rest
+ else case decoder (implode (rev text)) of
+ NONE => error pos "Invalid character entity"
+ | SOME chars => OK (chars, rest, pos+1)
+ in
+ case cc of
+ #"q" :: #"u" :: #"o" :: #"t" :: #";" :: rest =>
+ OK ([#"\""], rest, pos+5)
+ | #"a" :: #"m" :: #"p" :: #";" :: rest =>
+ OK ([#"&"], rest, pos+4)
+ | #"a" :: #"p" :: #"o" :: #"s" :: #";" :: rest =>
+ OK ([#"'"], rest, pos+5)
+ | #"l" :: #"t" :: #";" :: rest =>
+ OK ([#"<"], rest, pos+3)
+ | #"g" :: #"t" :: #";" :: rest =>
+ OK ([#">"], rest, pos+3)
+ | #"#" :: #"x" :: rest =>
+ entity' hexChar (pos+2) [] rest
+ | #"#" :: rest =>
+ entity' decChar (pos+1) [] rest
+ | _ => error pos "Invalid entity"
+ end
fun quoted quote pos acc cc =
let fun quoted' pos text [] =
@@ 144,6 205,11 @@ structure SubXml :> SUBXML = struct
| quoted' pos text (x::xs) =
if x = quote
then OK (rev text, xs, pos+1)
+ else if x = #"&"
+ then case entity (pos+1) xs of
+ ERROR e => ERROR e
+ | OK (chars, rest, newpos) =>
+ quoted' newpos (rev chars @ text) rest
else quoted' (pos+1) (x::text) xs
in
case quoted' pos [] cc of
@@ 245,6 311,10 @@ structure SubXml :> SUBXML = struct
#"<" => if text = []
then left (pos+1) acc xs
else left (pos+1) (textOf text :: acc) xs
+ | #"&" => (case entity (pos+1) xs of
+ ERROR e => ERROR e
+ | OK (chars, rest, newpos) =>
+ outside' newpos (rev chars @ text) acc rest)
| x => outside' (pos+1) (x::text) acc xs
in
outside' pos [] acc cc
@@ 373,17 443,25 @@ structure SubXml :> SUBXML = struct
(map node (List.filter
(fn ATTRIBUTE _ => false | _ => true)
nodes))
+
+ and encode text =
+ String.translate (fn #"\"" => """
+ | #"&" => "&"
+ | #"'" => "'"
+ | #"<" => "<"
+ | #">" => ">"
+ | c => str c) text
and node n =
case n of
TEXT string =>
- string
+ encode string
| CDATA string =>
"<![CDATA[" ^ string ^ "]]>"
| COMMENT string =>
- "<!-- " ^ string ^ "-->"
+ "<!--" ^ string ^ "-->"
| ATTRIBUTE { name, value } =>
- name ^ "=" ^ "\"" ^ value ^ "\"" (*!!!*)
+ name ^ "=" ^ "\"" ^ encode value ^ "\""
| ELEMENT { name, children } =>
"<" ^ name ^
(case (attributes children) of
M test.sh +34 -4
@@ 5,6 5,14 @@ set -eu
pass=0
fail=0
+if [ ! -d testfiles/good ]; then
+ echo "Run this from the repo root"
+ exit 2
+fi
+
+mkdir -p testfiles/tmp
+outdir=testfiles/tmp
+
for file in testfiles/bad/*.xml ; do
if ./subxmlparse "$file" ; then
@@ 19,10 27,32 @@ done
for file in testfiles/good/*.xml ; do
- if ./subxmlparse "$file" > out.xml ; then
- if xmllint out.xml >/dev/null ; then
- echo "--- pass: $file"
- pass=$(($pass + 1))
+ outfile="$outdir/out.xml"
+ lintfile_orig="$outdir/lint_orig.xml"
+ lintfile_mine="$outdir/lint_mine.xml"
+
+ if ./subxmlparse "$file" > "$outfile" ; then
+ if xmllint --encode UTF-8 --format --output "$lintfile_mine" "$outfile" >/dev/null ; then
+ xmllint --encode UTF-8 --format "$file" |
+ # Our expected diversion from the original
+ grep -v '<!-- Comment before document, ignored -->' |
+ grep -v '<!-- Comment after document, ignored -->' > "$lintfile_orig"
+ if ! diff -q "$lintfile_orig" "$lintfile_mine" >/dev/null; then
+ echo "*** FAIL: $file output does not match expected"
+ echo; echo "*** Input:"
+ cat "$file"
+ echo "*** Output:"
+ cat "$outfile"
+ echo "*** Linted input:"
+ cat "$lintfile_orig"
+ echo "*** Linted output (should match linted input):"
+ cat "$lintfile_mine"
+ echo "***"; echo
+ fail=$(($fail + 1))
+ else
+ echo "--- pass: $file"
+ pass=$(($pass + 1))
+ fi
else
echo "*** FAIL: $file"
fail=$(($fail + 1))
A => testfiles/bad/entities-1.xml +1 -0
@@ 0,0 1,1 @@
+<myelement myattribute="Some "ation;value" with <unknown entities> & &apost;things' in it"/>
A => testfiles/bad/entities-2.xml +2 -0
@@ 0,0 1,2 @@
+<myelement>Some 'text&apostrophe; and the < and > characters expressed as <numerical escapes></myelement>
+
A => testfiles/bad/entities-3.xml +1 -0
@@ 0,0 1,1 @@
+<myelement myattribute='Some value'/>
A => testfiles/bad/entities-4.xml +1 -0
@@ 0,0 1,1 @@
+<myelement>Some 'text' and the < and > characters expressed as <numerical escapes�</myelement>
A => +2 -0
@@ 0,0 1,2 @@
+<element>content</element>
+<!-- Comment after document, ignored -->
A => testfiles/good/entities-1.xml +1 -0
@@ 0,0 1,1 @@
+<myelement myattribute="Some "value" <with> <entities> & 'things' in it"/>
A => testfiles/good/entities-2.xml +2 -0
@@ 0,0 1,2 @@
+<myelement>Some 'text' and the < and > characters expressed as <numerical escapes></myelement>
+
A => testfiles/good/entities-3.xml +7 -0
@@ 0,0 1,7 @@
+<element>
+
+ 🤷 🤷 Shrug
+ ❤ ❤ Heavy Black Heart
+ 💩 💩 Pile of Poo
+
+</element>