#!/usr/bin/env ruby
# The XChar library is provided courtesy of Sam Ruby (See
# http://intertwingly.net/stories/2005/09/28/xchar.rb)
# --------------------------------------------------------------------
# If the Builder::XChar module is not currently defined, fail on any
# name clashes in standard library classes.
module Builder
def self.check_for_name_collision(klass, method_name, defined_constant=nil)
if klass.method_defined?(method_name.to_s)
fail RuntimeError,
"Name Collision: Method '#{method_name}' is already defined in #{klass}"
end
end
end
if ! defined?(Builder::XChar) and ! String.method_defined?(:encode)
Builder.check_for_name_collision(String, "to_xs")
Builder.check_for_name_collision(Fixnum, "xchr")
end
######################################################################
module Builder
####################################################################
# XML Character converter, from Sam Ruby:
# (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
#
module XChar # :nodoc:
# See
# http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
# for details.
CP1252 = { # :nodoc:
128 => 8364, # euro sign
130 => 8218, # single low-9 quotation mark
131 => 402, # latin small letter f with hook
132 => 8222, # double low-9 quotation mark
133 => 8230, # horizontal ellipsis
134 => 8224, # dagger
135 => 8225, # double dagger
136 => 710, # modifier letter circumflex accent
137 => 8240, # per mille sign
138 => 352, # latin capital letter s with caron
139 => 8249, # single left-pointing angle quotation mark
140 => 338, # latin capital ligature oe
142 => 381, # latin capital letter z with caron
145 => 8216, # left single quotation mark
146 => 8217, # right single quotation mark
147 => 8220, # left double quotation mark
148 => 8221, # right double quotation mark
149 => 8226, # bullet
150 => 8211, # en dash
151 => 8212, # em dash
152 => 732, # small tilde
153 => 8482, # trade mark sign
154 => 353, # latin small letter s with caron
155 => 8250, # single right-pointing angle quotation mark
156 => 339, # latin small ligature oe
158 => 382, # latin small letter z with caron
159 => 376, # latin capital letter y with diaeresis
}
# See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
PREDEFINED = {
38 => '&', # ampersand
60 => '<', # left angle bracket
62 => '>', # right angle bracket
}
# See http://www.w3.org/TR/REC-xml/#charsets for details.
VALID = [
0x9, 0xA, 0xD,
(0x20..0xD7FF),
(0xE000..0xFFFD),
(0x10000..0x10FFFF)
]
# http://www.fileformat.info/info/unicode/char/fffd/index.htm
REPLACEMENT_CHAR =
if String.method_defined?(:encode)
"\uFFFD"
elsif $KCODE == 'UTF8'
"\xEF\xBF\xBD"
else
'*'
end
end
end
if String.method_defined?(:encode)
module Builder
module XChar # :nodoc:
CP1252_DIFFERENCES, UNICODE_EQUIVALENT = Builder::XChar::CP1252.each.
inject([[],[]]) {|(domain,range),(key,value)|
[domain << key,range << value]
}.map {|seq| seq.pack('U*').force_encoding('utf-8')}
XML_PREDEFINED = Regexp.new('[' +
Builder::XChar::PREDEFINED.keys.pack('U*').force_encoding('utf-8') +
']')
INVALID_XML_CHAR = Regexp.new('[^'+
Builder::XChar::VALID.map { |item|
case item
when Fixnum
[item].pack('U').force_encoding('utf-8')
when Range
[item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
end
}.join +
']')
ENCODING_BINARY = Encoding.find('BINARY')
ENCODING_UTF8 = Encoding.find('UTF-8')
ENCODING_ISO1 = Encoding.find('ISO-8859-1')
# convert a string to valid UTF-8, compensating for a number of
# common errors.
def XChar.unicode(string)
if string.encoding == ENCODING_BINARY
if string.ascii_only?
string
else
string = string.clone.force_encoding(ENCODING_UTF8)
if string.valid_encoding?
string
else
string.encode(ENCODING_UTF8, ENCODING_ISO1)
end
end
elsif string.encoding == ENCODING_UTF8
if string.valid_encoding?
string
else
string.encode(ENCODING_UTF8, ENCODING_ISO1)
end
else
string.encode(ENCODING_UTF8)
end
end
# encode a string per XML rules
def XChar.encode(string)
unicode(string).
tr(CP1252_DIFFERENCES, UNICODE_EQUIVALENT).
gsub(INVALID_XML_CHAR, REPLACEMENT_CHAR).
gsub(XML_PREDEFINED) {|c| PREDEFINED[c.ord]}
end
end
end
else
######################################################################
# Enhance the Fixnum class with a XML escaped character conversion.
#
class Fixnum
XChar = Builder::XChar if ! defined?(XChar)
# XML escaped version of chr. When escape is set to false
# the CP1252 fix is still applied but utf-8 characters are not
# converted to character entities.
def xchr(escape=true)
n = XChar::CP1252[self] || self
case n when *XChar::VALID
XChar::PREDEFINED[n] or
(n<128 ? n.chr : (escape ? "#{n};" : [n].pack('U*')))
else
Builder::XChar::REPLACEMENT_CHAR
end
end
end
######################################################################
# Enhance the String class with a XML escaped character version of
# to_s.
#
class String
# XML escaped version of to_s. When escape is set to false
# the CP1252 fix is still applied but utf-8 characters are not
# converted to character entities.
def to_xs(escape=true)
unpack('U*').map {|n| n.xchr(escape)}.join # ASCII, UTF-8
rescue
unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
end
end
end