First installment towards addressing Ticket #134
3 files changed, 122 insertions(+), 20 deletions(-)

M src/rexml/parsers/baseparser.rb
M src/rexml/text.rb
M test/core_test.rb
M src/rexml/parsers/baseparser.rb +25 -8
@@ 25,7 25,12 @@ module REXML
     #
     # Nat Price gave me some good ideas for the API.
     class BaseParser
-      NCNAME_STR= '[\w:][\-\w\d.]*'
+      LETTER = 'a-zA-Z'
+      DIGIT = '\d'
+      COMBININGCHAR = '' # TODO
+      EXTENDER = ''      # TODO
+
+      NCNAME_STR= "[#{LETTER}_:][-#{LETTER}#{DIGIT}._:#{COMBININGCHAR}#{EXTENDER}]*"
       NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
       UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
 

          
@@ 33,7 38,7 @@ module REXML
       NAME = "([\\w:]#{NAMECHAR}*)"
       NMTOKEN = "(?:#{NAMECHAR})+"
       NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
-      REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
+      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
       REFERENCE_RE = /#{REFERENCE}/
 
       DOCTYPE_START = /\A\s*<!DOCTYPE\s/um

          
@@ 100,6 105,7 @@ module REXML
       }
 
 
+      ATTR_NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
       ######################################################################
       # These are patterns to identify common markup errors, to make the
       # error messages more informative.

          
@@ 340,6 346,12 @@ module REXML
               raise REXML::ParseException.new("Malformed node", @source) unless md
               if md[0][2] == ?-
                 md = @source.match( COMMENT_PATTERN, true )
+
+                case md[1]
+                when /--/, /-$/
+                  raise REXML::ParseException.new("Malformed comment", @source)
+                end
+
                 return [ :comment, md[1] ] if md
               else
                 md = @source.match( CDATA_PATTERN, true )

          
@@ 384,6 396,14 @@ module REXML
                   elsif b
                     prefixes << b unless b == "xml"
                   end
+
+                  Text.check(e, ATTR_NEEDS_A_SECOND_CHECK)
+
+                  if attributes.has_key? a
+                    msg = "Duplicate attribute #{a.inspect}"
+                    raise REXML::ParseException.new( msg, @source, self)
+                  end
+
                   attributes[a] = e 
                 }
               end

          
@@ 470,15 490,12 @@ module REXML
               if entity_value
                 re = /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
+              else
+                er = DEFAULT_ENTITIES[entity_reference]
+                rv.gsub!( er[0], er[2] ) if er
               end
             end
           end
-          matches.each do |entity_reference|
-            unless filter and filter.include?(entity_reference)
-              er = DEFAULT_ENTITIES[entity_reference]
-              rv.gsub!( er[0], er[2] ) if er
-            end
-          end
           rv.gsub!( /&amp;/, '&' )
         end
         rv

          
M src/rexml/text.rb +71 -6
@@ 18,8 18,40 @@ module REXML
     # If +raw+ is true, then REXML leaves the value alone
     attr_accessor :raw
 
-    ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
+    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
     NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ 
+    VALID_CHAR = [
+      0x9, 0xA, 0xD,
+      (0x20..0xD7FF),
+      (0xE000..0xFFFD),
+      (0x10000..0x10FFFF)
+    ]
+
+    if String.method_defined? :encode
+      VALID_XML_CHARS = Regexp.new('^['+
+        VALID_CHAR.map { |item|
+          case item
+          when Fixnum
+            [item].pack('U').force_encoding('utf-8')
+          when Range
+            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
+          end
+        }.join +
+      ']*$')
+    else
+      VALID_XML_CHARS = /^(
+           [\x09\x0A\x0D\x20-\x7E]            # ASCII
+         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
+         |  \xEF[\x80-\xBE]{2}                #
+         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
+         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+       )*$/x; 
+    end
 
     # Constructor
     # +arg+ if a String, the content is set to the String.  If a Text,

          
@@ 58,7 90,7 @@ module REXML
     #
     # +pattern+ INTERNAL USE ONLY
     def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, 
-      entity_filter=nil, illegal=ILLEGAL )
+      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )
 
       @raw = false
 

          
@@ 85,10 117,43 @@ module REXML
 
       @string.gsub!( /\r\n?/, "\n" )
 
-      # check for illegal characters
-      if @raw
-        if @string =~ illegal
-          raise "Illegal character '#{$1}' in raw string \"#{@string}\""
+      Text.check(@string, illegal) if @raw
+    end
+
+    # check for illegal characters
+    def Text.check string, pattern
+
+      # illegal anywhere
+      if string !~ VALID_XML_CHARS
+        if String.method_defined? :encode
+          string.chars.each do |c|
+            case c.ord
+            when *VALID_CHAR
+            else
+              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+            end
+          end
+        else
+          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/) do |c|
+            case c.unpack('U')
+            when *VALID_CHAR
+            else
+              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+            end
+          end
+        end
+      end
+
+      # context sensitive
+      string.scan(pattern).each do
+        if $1[-1] != ?;
+          raise "Illegal character '#{$1}' in raw string \"#{string}\""
+        elsif $5 and $5[0] == ?#
+          case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character '#{$1}' in raw string \"#{string}\""
+          end
         end
       end
     end

          
M test/core_test.rb +26 -6
@@ 30,11 30,31 @@ class Tester < Test::Unit::TestCase
     EOL
   end
 
-  def test_bad_markup_1
-    src="<pkg='version'> foo </pkg>"
-    assert_raises( ParseException, %Q{Test against "#{src}" should have failed!} )  {
-      Document.new(src)
-    }
+  def test_bad_markup
+    [
+      "<pkg='version'> foo </pkg>",
+      '<0/>',
+      '<a>&</a>',
+      '<a>&a</a>',
+    # '<a>&a;</a>', TODO
+      '<a a="<"/>',
+      '<a 3="<"/>',
+      '<a a="1" a="2"/>',
+      '<a><!-- -- --></a>',
+      '<a><!-- ---></a>',
+      '<a>&#x00;</a>',
+      '<a>&#0;</a>',
+      "<a a='&#0;' />",
+      "<a>\f</a>",
+      "<a a='\f' />",
+      "<a a='&#0;' />",
+    # '<a' + [160].pack('U') + ' />', TODO
+    # '<a a' + [160].pack('U') + '="" />', TODO
+    ].each do |src|
+      assert_raises( ParseException, %Q{Parse #{src.inspect} should have failed!} ) do
+        Document.new(src)
+      end
+    end
   end
 
   def test_attribute

          
@@ 899,7 919,7 @@ EOL
   end
 
   def test_0xD_in_preface
-    doc = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\0xD<opml version=\"1.0\">\0xD</opml>"
+    doc = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\x0D<opml version=\"1.0\">\x0D</opml>"
     doc = Document.new doc
   end