Complete Ticket #134
M src/rexml/attribute.rb +13 -10
@@ 17,6 17,8 @@ module REXML
 		attr_writer :normalized	
 		PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um
 
+    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
+
 		# Constructor.
     # FIXME: The parser doesn't catch illegal characters in attributes
     #

          
@@ 110,15 112,16 @@ module REXML
 			end
 		end
 
-		# Returns the attribute value, with entities replaced
-		def to_s
-			return @normalized if @normalized
-
-			doctype = nil
+    def doctype # doctype of @element's document; nil without element/document
 			if @element
 				doc = @element.document
-				doctype = doc.doctype if doc
+				doc.doctype if doc
 			end
+    end
+
+		# Returns the attribute value, with entities replaced
+		def to_s
+			return @normalized if @normalized
 
 			@normalized = Text::normalize( @unnormalized, doctype )
 			@unnormalized = nil

          
@@ 129,11 132,6 @@ module REXML
 		# have been expanded to their values
 		def value
 			return @unnormalized if @unnormalized
-			doctype = nil
-			if @element
-				doc = @element.document
-				doctype = doc.doctype if doc
-			end
 			@unnormalized = Text::unnormalize( @normalized, doctype )
 			@normalized = nil
       @unnormalized

          
@@ 150,6 148,11 @@ module REXML
 		# Returns this attribute
 		def element=( element )
 			@element = element
+
+      # A value that is already normalized is checked here, so that +doctype+
+      # is looked up through the element that was just assigned.
+      Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) if @normalized
+
 			self
 		end
 

          
M src/rexml/cdata.rb +1 -1
@@ 13,7 13,7 @@ module REXML
 		#  CData.new( "Here is some CDATA" )
 		#  CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element )
 		def initialize( first, whitespace=true, parent=nil )
-			super( first, whitespace, parent, true, true, ILLEGAL )
+			super( first, whitespace, parent, false, true, ILLEGAL ) # NOTE(review): 4th positional arg flipped true -> false; presumably Text's raw flag - confirm against Text#initialize
 		end
 
 		# Make a copy of this object

          
M src/rexml/parsers/baseparser.rb +10 -5
@@ 25,8 25,16 @@ module REXML
     #
     # Nat Price gave me some good ideas for the API.
     class BaseParser
-      LETTER = 'a-zA-Z'
-      DIGIT = '\d'
+      # LETTER and DIGIT are picked by regexp capability; the presence of
+      # String#encode is the marker for a unicode-aware engine.
+      LETTER, DIGIT =
+        if String.method_defined?( :encode )
+          # Oniguruma / POSIX [understands unicode]
+          [ '[[:alpha:]]', '[[:digit:]]' ]
+        else
+          [ 'a-zA-Z', '\d' ] # Ruby < 1.9 [doesn't understand unicode]
+        end
+
       COMBININGCHAR = '' # TODO
       EXTENDER = ''      # TODO
 

          
@@ 105,7 113,6 @@ module REXML
       }
 
 
-      ATTR_NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
       ######################################################################
       # These are patterns to identify common markup errors, to make the
       # error messages more informative.

          
@@ 397,8 404,6 @@ module REXML
                     prefixes << b unless b == "xml"
                   end
 
-                  Text.check(e, ATTR_NEEDS_A_SECOND_CHECK)
-
                   if attributes.has_key? a
                     msg = "Duplicate attribute #{a.inspect}"
                     raise REXML::ParseException.new( msg, @source, self)

          
M src/rexml/text.rb +25 -18
@@ 117,11 117,16 @@ module REXML
 
       @string.gsub!( /\r\n?/, "\n" )
 
-      Text.check(@string, illegal) if @raw
+      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+    end
+
+    def parent=( parent ) # checks raw text once the new parent is in place
+      super( parent )
+      Text.check( @string, NEEDS_A_SECOND_CHECK, doctype ) if @raw && @parent
     end
 
     # check for illegal characters
-    def Text.check string, pattern
+    def Text.check string, pattern, doctype
 
       # illegal anywhere
       if string !~ VALID_XML_CHARS

          
@@ 148,11 153,17 @@ module REXML
       string.scan(pattern).each do
         if $1[-1] != ?;
           raise "Illegal character '#{$1}' in raw string \"#{string}\""
-        elsif $5 and $5[0] == ?#
-          case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-          when *VALID_CHAR
-          else
-            raise "Illegal character '#{$1}' in raw string \"#{string}\""
+        elsif $1[0] == ?&
+          if $5 and $5[0] == ?#
+            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
+            when *VALID_CHAR
+            else
+              raise "Illegal character '#{$1}' in raw string \"#{string}\""
+            end
+          elsif $3 and !SUBSTITUTES.include?($1)
+            if !doctype or !doctype.entities.has_key?($3)
+              raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
+            end
           end
         end
       end

          
@@ 185,6 196,13 @@ module REXML
       to_s() <=> other.to_s
     end
 
+    # Doctype of the parent's document; nil without a parent or a document.
+    def doctype
+      return nil unless @parent
+      owner = @parent.document
+      owner.doctype if owner
+    end
+
     REFERENCE = /#{Entity::REFERENCE}/
     # Returns the string value of this text node.  This string is always
     # escaped, meaning that it is a valid XML text node string, and all

          
@@ 203,12 221,6 @@ module REXML
       return @string if @raw
       return @normalized if @normalized
 
-      doctype = nil
-      if @parent
-        doc = @parent.document
-        doctype = doc.doctype if doc
-      end
-
       @normalized = Text::normalize( @string, doctype, @entity_filter )
     end
 

          
@@ 231,11 243,6 @@ module REXML
     #   u.value   #-> "sean russell"
     def value
       return @unnormalized if @unnormalized
-      doctype = nil
-      if @parent
-        doc = @parent.document
-        doctype = doc.doctype if doc
-      end
       @unnormalized = Text::unnormalize( @string, doctype )
     end
 

          
M test/core_test.rb +8 -4
@@ 36,7 36,7 @@ class Tester < Test::Unit::TestCase
       '<0/>',
       '<a>&</a>',
       '<a>&a</a>',
-    # '<a>&a;</a>', TODO
+      '<a>&a;</a>',
       '<a a="<"/>',
       '<a 3="<"/>',
       '<a a="1" a="2"/>',

          
@@ 47,9 47,12 @@ class Tester < Test::Unit::TestCase
       "<a a='&#0;' />",
       "<a>\f</a>",
       "<a a='\f' />",
-      "<a a='&#0;' />",
-    # '<a' + [160].pack('U') + ' />', TODO
-    # '<a a' + [160].pack('U') + '="" />', TODO
+      "<a>\000</a>",
+      '<a' + [65535].pack('U') + ' />',
+      '<a>&#xfffe;</a>',
+      '<a>&#65535;</a>',
+      '<a' + [0x0371].pack('U') + ' />',
+      '<a a' + [0x0371].pack('U') + '="" />',
     ].each do |src|
       assert_raises( ParseException, %Q{Parse #{src.inspect} should have failed!} ) do
         Document.new(src)

          
@@ 1208,6 1211,7 @@ EOL
     xmldoc = Document.new("<test/>")
     xmldoc << XMLDecl.new(XMLDecl::DEFAULT_VERSION, "UTF-8")
     content = ['61c3a927223c3e26'].pack("H*")  
+    content.force_encoding('UTF-8') if content.respond_to?(:force_encoding)
     #- is some UTF-8 text but just to make sure my editor won't magically convert..
     xmldoc.root.add_attribute('attr', content)
     f.write(xmldoc,out=[])