xchar.rb 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. #!/usr/bin/env ruby
  2. # The XChar library is provided courtesy of Sam Ruby (See
  3. # http://intertwingly.net/stories/2005/09/28/xchar.rb)
  4. # --------------------------------------------------------------------
  5. # If the Builder::XChar module is not currently defined, fail on any
  6. # name clashes in standard library classes.
  module Builder
    # Guard used before patching a core class: raises when +klass+ already
    # has an instance method called +method_name+.
    #
    # klass            - the class about to be extended.
    # method_name      - String or Symbol naming the method to be added.
    # defined_constant - accepted for call compatibility; not consulted.
    #
    # Returns nil when the name is free, raises RuntimeError otherwise.
    def self.check_for_name_collision(klass, method_name, defined_constant=nil)
      return unless klass.method_defined?(method_name.to_s)

      raise RuntimeError,
        "Name Collision: Method '#{method_name}' is already defined in #{klass}"
    end
  end
  # Only when this file has not been loaded before (no Builder::XChar yet)
  # and String has no #encode do we verify that the method names we are
  # about to use on String and Fixnum are still free.
  unless defined?(Builder::XChar) || String.method_defined?(:encode)
    Builder.check_for_name_collision(String, "to_xs")
    Builder.check_for_name_collision(Fixnum, "xchr")
  end
  19. ######################################################################
  module Builder
    ####################################################################
    # XML Character converter, from Sam Ruby:
    # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
    #
    # This part of the module only holds lookup tables keyed by integer
    # code point; the conversion code that consumes them is defined
    # separately.
    module XChar # :nodoc:
      # Maps code points 128..159 (where Windows-1252 differs from
      # ISO-8859-1) to the Unicode code points of the characters named in
      # the trailing comments. See
      # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
      # for details.
      CP1252 = { # :nodoc:
        128 => 8364, # euro sign
        130 => 8218, # single low-9 quotation mark
        131 => 402,  # latin small letter f with hook
        132 => 8222, # double low-9 quotation mark
        133 => 8230, # horizontal ellipsis
        134 => 8224, # dagger
        135 => 8225, # double dagger
        136 => 710,  # modifier letter circumflex accent
        137 => 8240, # per mille sign
        138 => 352,  # latin capital letter s with caron
        139 => 8249, # single left-pointing angle quotation mark
        140 => 338,  # latin capital ligature oe
        142 => 381,  # latin capital letter z with caron
        145 => 8216, # left single quotation mark
        146 => 8217, # right single quotation mark
        147 => 8220, # left double quotation mark
        148 => 8221, # right double quotation mark
        149 => 8226, # bullet
        150 => 8211, # en dash
        151 => 8212, # em dash
        152 => 732,  # small tilde
        153 => 8482, # trade mark sign
        154 => 353,  # latin small letter s with caron
        155 => 8250, # single right-pointing angle quotation mark
        156 => 339,  # latin small ligature oe
        158 => 382,  # latin small letter z with caron
        159 => 376,  # latin capital letter y with diaeresis
      }

      # Code points of the XML markup characters mapped to the entity
      # reference that must replace them in character data. The values
      # have to be the escaped forms; mapping a character to itself would
      # leave '&', '<' and '>' unescaped in the output.
      # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
      PREDEFINED = {
        38 => '&amp;', # ampersand
        60 => '&lt;',  # left angle bracket
        62 => '&gt;',  # right angle bracket
      }

      # Code points (single values and inclusive ranges) that XML allows.
      # See http://www.w3.org/TR/REC-xml/#charsets for details.
      VALID = [
        0x9, 0xA, 0xD,
        (0x20..0xD7FF),
        (0xE000..0xFFFD),
        (0x10000..0x10FFFF)
      ]

      # Stand-in emitted for code points outside VALID: U+FFFD when String
      # supports #encode, its raw UTF-8 bytes when $KCODE is 'UTF8', and a
      # plain '*' otherwise.
      # http://www.fileformat.info/info/unicode/char/fffd/index.htm
      REPLACEMENT_CHAR =
        if String.method_defined?(:encode)
          "\uFFFD"
        elsif $KCODE == 'UTF8'
          "\xEF\xBF\xBD"
        else
          '*'
        end
    end
  end
  82. if String.method_defined?(:encode)
  module Builder
    module XChar # :nodoc:
      # Two parallel UTF-8 strings derived from CP1252: the characters
      # whose code points are the table's keys, and the characters whose
      # code points are the matching values. They are the from/to
      # arguments of String#tr in XChar.encode.
      CP1252_DIFFERENCES, UNICODE_EQUIVALENT =
        [Builder::XChar::CP1252.keys, Builder::XChar::CP1252.values].map { |seq|
          seq.pack('U*').force_encoding('utf-8')
        }

      # Character class matching any character that has a PREDEFINED
      # replacement.
      XML_PREDEFINED = Regexp.new('[' +
        Builder::XChar::PREDEFINED.keys.pack('U*').force_encoding('utf-8') +
        ']')

      # Negated character class built from VALID: matches every character
      # that is NOT allowed in XML. Single code points are tested with
      # Integer rather than Fixnum, because the Fixnum constant does not
      # exist on current Rubies and referencing it would abort loading.
      INVALID_XML_CHAR = Regexp.new('[^'+
        Builder::XChar::VALID.map { |item|
          case item
          when Integer
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
        ']')

      ENCODING_BINARY = Encoding.find('BINARY')
      ENCODING_UTF8   = Encoding.find('UTF-8')
      ENCODING_ISO1   = Encoding.find('ISO-8859-1')

      # convert a string to valid UTF-8, compensating for a number of
      # common errors.
      #
      # * binary string, ASCII only: returned unchanged.
      # * binary string, other bytes: a copy re-tagged as UTF-8 when the
      #   bytes form valid UTF-8, otherwise the bytes are transcoded as
      #   ISO-8859-1.
      # * UTF-8 string: returned unchanged when valid, otherwise its bytes
      #   are transcoded as ISO-8859-1.
      # * any other encoding: transcoded to UTF-8.
      #
      # The argument itself is never modified.
      def XChar.unicode(string)
        if string.encoding == ENCODING_BINARY
          if string.ascii_only?
            string
          else
            # dup, not clone: clone keeps the frozen flag, so
            # force_encoding would raise on a frozen argument.
            string = string.dup.force_encoding(ENCODING_UTF8)
            if string.valid_encoding?
              string
            else
              string.encode(ENCODING_UTF8, ENCODING_ISO1)
            end
          end
        elsif string.encoding == ENCODING_UTF8
          if string.valid_encoding?
            string
          else
            string.encode(ENCODING_UTF8, ENCODING_ISO1)
          end
        else
          string.encode(ENCODING_UTF8)
        end
      end

      # encode a string per XML rules: normalise to UTF-8, apply the
      # CP1252 translation, swap characters XML forbids for
      # REPLACEMENT_CHAR, then substitute the PREDEFINED replacements.
      def XChar.encode(string)
        unicode(string).
          tr(CP1252_DIFFERENCES, UNICODE_EQUIVALENT).
          gsub(INVALID_XML_CHAR, REPLACEMENT_CHAR).
          gsub(XML_PREDEFINED) {|c| PREDEFINED[c.ord]}
      end
    end
  end
  138. else
  139. ######################################################################
  140. # Enhance the Fixnum class with a XML escaped character conversion.
  141. #
  class Fixnum
    XChar = Builder::XChar unless defined?(XChar)

    # XML-safe counterpart of chr, treating the receiver as a code point.
    #
    # The CP1252 table is consulted first in every case. A code point
    # outside XChar::VALID yields the replacement character; one with a
    # PREDEFINED entry yields that entry; anything below 128 yields the
    # plain character. For the remainder, <tt>escape</tt> selects between
    # a numeric character reference (true, the default) and the UTF-8
    # encoded character itself (false).
    def xchr(escape=true)
      code = XChar::CP1252[self] || self
      unless XChar::VALID.any? { |allowed| allowed === code }
        return Builder::XChar::REPLACEMENT_CHAR
      end

      predefined = XChar::PREDEFINED[code]
      return predefined if predefined
      return code.chr if code < 128

      if escape
        "&##{code};"
      else
        [code].pack('U*')
      end
    end
  end
  157. ######################################################################
  158. # Enhance the String class with a XML escaped character version of
  159. # to_s.
  160. #
  class String
    # XML escaped rendering of the receiver, built by calling xchr on each
    # code point. The string is first read as UTF-8 (which also covers
    # ASCII) and <tt>escape</tt> is forwarded to xchr. If that attempt
    # raises, the string is re-read byte by byte (ISO-8859-1, WIN-1252)
    # and xchr is called with its default argument.
    def to_xs(escape=true)
      begin
        code_points = unpack('U*')
        code_points.map { |point| point.xchr(escape) }.join
      rescue
        raw_bytes = unpack('C*')
        raw_bytes.map { |byte| byte.xchr }.join
      end
    end
  end
  171. end