#!/usr/bin/env ruby

# The XChar library is provided courtesy of Sam Ruby (See
# http://intertwingly.net/stories/2005/09/28/xchar.rb)

# --------------------------------------------------------------------

# If the Builder::XChar module is not currently defined, fail on any
# name clashes in standard library classes.

module Builder
  # Raise a RuntimeError when +klass+ already defines an instance method
  # called +method_name+.
  #
  # klass::            class or module to inspect.
  # method_name::      method name (String or Symbol) to look for.
  # defined_constant:: accepted for backward compatibility; not used.
  def self.check_for_name_collision(klass, method_name, defined_constant=nil)
    if klass.method_defined?(method_name.to_s)
      raise RuntimeError,
        "Name Collision: Method '#{method_name}' is already defined in #{klass}"
    end
  end
end

# The core-class extensions (String#to_xs, Fixnum#xchr) are only installed
# on interpreters without String#encode, so the collision check is only
# needed there.
if !defined?(Builder::XChar) && !String.method_defined?(:encode)
  Builder.check_for_name_collision(String, "to_xs")
  Builder.check_for_name_collision(Fixnum, "xchr")
end

######################################################################
module Builder

  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  module XChar # :nodoc:

    # Maps the code points 128..159 that Windows-1252 assigns to printable
    # characters onto the equivalent Unicode code points.
    #
    # See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = {                  # :nodoc:
      128 => 8364,              # euro sign
      130 => 8218,              # single low-9 quotation mark
      131 =>  402,              # latin small letter f with hook
      132 => 8222,              # double low-9 quotation mark
      133 => 8230,              # horizontal ellipsis
      134 => 8224,              # dagger
      135 => 8225,              # double dagger
      136 =>  710,              # modifier letter circumflex accent
      137 => 8240,              # per mille sign
      138 =>  352,              # latin capital letter s with caron
      139 => 8249,              # single left-pointing angle quotation mark
      140 =>  338,              # latin capital ligature oe
      142 =>  381,              # latin capital letter z with caron
      145 => 8216,              # left single quotation mark
      146 => 8217,              # right single quotation mark
      147 => 8220,              # left double quotation mark
      148 => 8221,              # right double quotation mark
      149 => 8226,              # bullet
      150 => 8211,              # en dash
      151 => 8212,              # em dash
      152 =>  732,              # small tilde
      153 => 8482,              # trade mark sign
      154 =>  353,              # latin small letter s with caron
      155 => 8250,              # single right-pointing angle quotation mark
      156 =>  339,              # latin small ligature oe
      158 =>  382,              # latin small letter z with caron
      159 =>  376,              # latin capital letter y with diaeresis
    }

    # Code point => entity reference for the characters that must be
    # escaped in XML character data.
    #
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    PREDEFINED = {
      38 => '&amp;',            # ampersand
      60 => '&lt;',             # left angle bracket
      62 => '&gt;',             # right angle bracket
    }

    # Code points (single values and ranges) that are legal in an XML
    # document.
    #
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    # Substitute emitted for characters that are not legal in XML.
    #
    # http://www.fileformat.info/info/unicode/char/fffd/index.htm
    REPLACEMENT_CHAR =
      if String.method_defined?(:encode)
        "\uFFFD"
      elsif $KCODE == 'UTF8'
        "\xEF\xBF\xBD"
      else
        '*'
      end
  end

end


if String.method_defined?(:encode)
  module Builder
    module XChar # :nodoc:
      # The CP1252 keys and values packed into two parallel UTF-8 strings,
      # in the from/to form expected by String#tr.
      CP1252_DIFFERENCES, UNICODE_EQUIVALENT =
        [Builder::XChar::CP1252.keys, Builder::XChar::CP1252.values].
          map {|seq| seq.pack('U*').force_encoding('utf-8')}

      # Character class matching every character that has a predefined
      # entity reference.
      XML_PREDEFINED = Regexp.new('[' +
        Builder::XChar::PREDEFINED.keys.pack('U*').force_encoding('utf-8') +
      ']')

      # Negated character class: matches anything not listed in VALID.
      INVALID_XML_CHAR = Regexp.new('[^'+
        Builder::XChar::VALID.map { |item|
          case item
          when Integer          # Fixnum no longer exists on current Rubies
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
      ']')

      ENCODING_BINARY = Encoding.find('BINARY')
      ENCODING_UTF8   = Encoding.find('UTF-8')
      ENCODING_ISO1   = Encoding.find('ISO-8859-1')

      # convert a string to valid UTF-8, compensating for a number of
      # common errors.
      #
      # * BINARY, ASCII only: returned unchanged.
      # * BINARY whose bytes form valid UTF-8: relabelled copy.
      # * BINARY or UTF-8 with invalid byte sequences: the bytes are
      #   reinterpreted as ISO-8859-1 and transcoded to UTF-8.
      # * anything else: transcoded to UTF-8 from its declared encoding.
      #
      # The argument is never modified.
      def XChar.unicode(string)
        if string.encoding == ENCODING_BINARY
          if string.ascii_only?
            string
          else
            # dup rather than clone: clone would carry over a frozen flag
            # and make force_encoding raise.
            string = string.dup.force_encoding(ENCODING_UTF8)
            if string.valid_encoding?
              string
            else
              string.encode(ENCODING_UTF8, ENCODING_ISO1)
            end
          end

        elsif string.encoding == ENCODING_UTF8
          if string.valid_encoding?
            string
          else
            string.encode(ENCODING_UTF8, ENCODING_ISO1)
          end

        else
          string.encode(ENCODING_UTF8)
        end
      end

      # encode a string per XML rules: normalise to UTF-8, remap the
      # Windows-1252 code points, replace characters that are illegal in
      # XML with REPLACEMENT_CHAR, then substitute entity references for
      # the predefined characters.
      def XChar.encode(string)
        unicode(string).
          tr(CP1252_DIFFERENCES, UNICODE_EQUIVALENT).
          gsub(INVALID_XML_CHAR, REPLACEMENT_CHAR).
          gsub(XML_PREDEFINED) {|c| PREDEFINED[c.ord]}
      end
    end
  end

else

  # Legacy path: only evaluated on interpreters without String#encode,
  # where Fixnum is still a distinct class.

  ######################################################################
  # Enhance the Fixnum class with a XML escaped character conversion.
  #
  class Fixnum
    XChar = Builder::XChar if ! defined?(XChar)

    # XML escaped version of chr. When <tt>escape</tt> is set to false
    # the CP1252 fix is still applied but utf-8 characters are not
    # converted to character entities.
    def xchr(escape=true)
      n = XChar::CP1252[self] || self
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] or
          (n<128 ? n.chr : (escape ? "&##{n};" : [n].pack('U*')))
      else
        Builder::XChar::REPLACEMENT_CHAR
      end
    end
  end


  ######################################################################
  # Enhance the String class with a XML escaped character version of
  # to_s.
  #
  class String
    # XML escaped version of to_s. When <tt>escape</tt> is set to false
    # the CP1252 fix is still applied but utf-8 characters are not
    # converted to character entities.
    def to_xs(escape=true)
      unpack('U*').map {|n| n.xchr(escape)}.join # ASCII, UTF-8
    rescue
      unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
    end
  end
end