| [1329] | 1 | diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb | 
|---|
|  | 2 | index 65a96af..b6354ee 100644 | 
|---|
|  | 3 | --- a/activesupport/lib/active_support/multibyte.rb | 
|---|
|  | 4 | +++ b/activesupport/lib/active_support/multibyte.rb | 
|---|
|  | 5 | @@ -1,9 +1,5 @@ | 
|---|
|  | 6 | # encoding: utf-8 | 
|---|
|  | 7 |  | 
|---|
|  | 8 | -require 'active_support/multibyte/chars' | 
|---|
|  | 9 | -require 'active_support/multibyte/exceptions' | 
|---|
|  | 10 | -require 'active_support/multibyte/unicode_database' | 
|---|
|  | 11 | - | 
|---|
|  | 12 | module ActiveSupport #:nodoc: | 
|---|
|  | 13 | module Multibyte | 
|---|
|  | 14 | # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more | 
|---|
|  | 15 | @@ -27,7 +23,35 @@ module ActiveSupport #:nodoc: | 
|---|
|  | 16 | # | 
|---|
|  | 17 | # Example: | 
|---|
|  | 18 | #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32 | 
|---|
|  | 19 | -    mattr_accessor :proxy_class | 
|---|
|  | 20 | -    self.proxy_class = ActiveSupport::Multibyte::Chars | 
|---|
|  | 21 | +    def self.proxy_class=(klass) | 
|---|
|  | 22 | +      @proxy_class = klass | 
|---|
|  | 23 | +    end | 
|---|
|  | 24 | + | 
|---|
|  | 25 | +    # Returns the current proxy class | 
|---|
|  | 26 | +    def self.proxy_class | 
|---|
|  | 27 | +      @proxy_class ||= ActiveSupport::Multibyte::Chars | 
|---|
|  | 28 | +    end | 
|---|
|  | 29 | + | 
|---|
|  | 30 | +    # Regular expressions that describe valid byte sequences for a character | 
|---|
|  | 31 | +    VALID_CHARACTER = { | 
|---|
|  | 32 | +      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | 
|---|
|  | 33 | +      'UTF-8' => /\A(?: | 
|---|
|  | 34 | +                  [\x00-\x7f]                                         | | 
|---|
|  | 35 | +                  [\xc2-\xdf] [\x80-\xbf]                             | | 
|---|
|  | 36 | +                  \xe0        [\xa0-\xbf] [\x80-\xbf]                 | | 
|---|
|  | 37 | +                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]                 | | 
|---|
|  | 38 | +                  \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]     | | 
|---|
|  | 39 | +                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf]     | | 
|---|
|  | 40 | +                  \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn, | 
|---|
|  | 41 | +      # Quick check for valid Shift-JIS characters; disregards the odd-even pairing | 
|---|
|  | 42 | +      'Shift_JIS' => /\A(?: | 
|---|
|  | 43 | +                  [\x00-\x7e \xa1-\xdf]                                     | | 
|---|
|  | 44 | +                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn | 
|---|
|  | 45 | +    } | 
|---|
|  | 46 | end | 
|---|
|  | 47 | end | 
|---|
|  | 48 | + | 
|---|
|  | 49 | +require 'active_support/multibyte/chars' | 
|---|
|  | 50 | +require 'active_support/multibyte/exceptions' | 
|---|
|  | 51 | +require 'active_support/multibyte/unicode_database' | 
|---|
|  | 52 | +require 'active_support/multibyte/utils' | 
|---|
|  | 53 | diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
|  | 54 | index 3d392d2..16bc130 100644 | 
|---|
|  | 55 | --- a/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
|  | 56 | +++ b/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
|  | 57 | @@ -73,16 +73,7 @@ module ActiveSupport #:nodoc: | 
|---|
|  | 58 | UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ | 
|---|
|  | 59 | UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ | 
|---|
|  | 60 |  | 
|---|
|  | 61 | -      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | 
|---|
|  | 62 | -      UTF8_PAT = /\A(?: | 
|---|
|  | 63 | -                     [\x00-\x7f]                                     | | 
|---|
|  | 64 | -                     [\xc2-\xdf] [\x80-\xbf]                         | | 
|---|
|  | 65 | -                     \xe0        [\xa0-\xbf] [\x80-\xbf]             | | 
|---|
|  | 66 | -                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             | | 
|---|
|  | 67 | -                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | | 
|---|
|  | 68 | -                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | | 
|---|
|  | 69 | -                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] | 
|---|
|  | 70 | -                    )*\z/xn | 
|---|
|  | 71 | +      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] | 
|---|
|  | 72 |  | 
|---|
|  | 73 | attr_reader :wrapped_string | 
|---|
|  | 74 | alias to_s wrapped_string | 
|---|
|  | 75 | @@ -307,23 +298,23 @@ module ActiveSupport #:nodoc: | 
|---|
|  | 76 | def rstrip | 
|---|
|  | 77 | chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) | 
|---|
|  | 78 | end | 
|---|
|  | 79 | - | 
|---|
|  | 80 | + | 
|---|
|  | 81 | # Strips entire range of Unicode whitespace from the left of the string. | 
|---|
|  | 82 | def lstrip | 
|---|
|  | 83 | chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) | 
|---|
|  | 84 | end | 
|---|
|  | 85 | - | 
|---|
|  | 86 | + | 
|---|
|  | 87 | # Strips entire range of Unicode whitespace from the right and left of the string. | 
|---|
|  | 88 | def strip | 
|---|
|  | 89 | rstrip.lstrip | 
|---|
|  | 90 | end | 
|---|
|  | 91 | - | 
|---|
|  | 92 | + | 
|---|
|  | 93 | # Returns the number of codepoints in the string | 
|---|
|  | 94 | def size | 
|---|
|  | 95 | self.class.u_unpack(@wrapped_string).size | 
|---|
|  | 96 | end | 
|---|
|  | 97 | alias_method :length, :size | 
|---|
|  | 98 | - | 
|---|
|  | 99 | + | 
|---|
|  | 100 | # Reverses all characters in the string. | 
|---|
|  | 101 | # | 
|---|
|  | 102 | # Example: | 
|---|
|  | 103 | @@ -331,7 +322,7 @@ module ActiveSupport #:nodoc: | 
|---|
|  | 104 | def reverse | 
|---|
|  | 105 | chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) | 
|---|
|  | 106 | end | 
|---|
|  | 107 | - | 
|---|
|  | 108 | + | 
|---|
|  | 109 | # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that | 
|---|
|  | 110 | # character. | 
|---|
|  | 111 | # | 
|---|
|  | 112 | @@ -646,7 +637,7 @@ module ActiveSupport #:nodoc: | 
|---|
|  | 113 | string.split(//u).map do |c| | 
|---|
|  | 114 | c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) | 
|---|
|  | 115 |  | 
|---|
|  | 116 | -            if !UTF8_PAT.match(c) | 
|---|
|  | 117 | +            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) | 
|---|
|  | 118 | n = c.unpack('C')[0] | 
|---|
|  | 119 | n < 128 ? n.chr : | 
|---|
|  | 120 | n < 160 ? [UCD.cp1252[n] || n].pack('U') : | 
|---|
|  | 121 | diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb | 
|---|
|  | 122 | new file mode 100644 | 
|---|
|  | 123 | index 0000000..acef84d | 
|---|
|  | 124 | --- /dev/null | 
|---|
|  | 125 | +++ b/activesupport/lib/active_support/multibyte/utils.rb | 
|---|
|  | 126 | @@ -0,0 +1,61 @@ | 
|---|
|  | 127 | +# encoding: utf-8 | 
|---|
|  | 128 | + | 
|---|
|  | 129 | +module ActiveSupport #:nodoc: | 
|---|
|  | 130 | +  module Multibyte #:nodoc: | 
|---|
|  | 131 | +    if Kernel.const_defined?(:Encoding) | 
|---|
|  | 132 | +      # Returns a regular expression that matches valid characters in the current encoding | 
|---|
|  | 133 | +      def self.valid_character | 
|---|
|  | 134 | +        VALID_CHARACTER[Encoding.default_internal.to_s] | 
|---|
|  | 135 | +      end | 
|---|
|  | 136 | +    else | 
|---|
|  | 137 | +      def self.valid_character | 
|---|
|  | 138 | +        case $KCODE | 
|---|
|  | 139 | +        when 'UTF8' | 
|---|
|  | 140 | +          VALID_CHARACTER['UTF-8'] | 
|---|
|  | 141 | +        when 'SJIS' | 
|---|
|  | 142 | +          VALID_CHARACTER['Shift_JIS'] | 
|---|
|  | 143 | +        end | 
|---|
|  | 144 | +      end | 
|---|
|  | 145 | +    end | 
|---|
|  | 146 | + | 
|---|
|  | 147 | +    if 'string'.respond_to?(:valid_encoding?) | 
|---|
|  | 148 | +      # Verifies the encoding of a string | 
|---|
|  | 149 | +      def self.verify(string) | 
|---|
|  | 150 | +        string.valid_encoding? | 
|---|
|  | 151 | +      end | 
|---|
|  | 152 | +    else | 
|---|
|  | 153 | +      def self.verify(string) | 
|---|
|  | 154 | +        if expression = valid_character | 
|---|
|  | 155 | +          for c in string.split(//) | 
|---|
|  | 156 | +            return false unless valid_character.match(c) | 
|---|
|  | 157 | +          end | 
|---|
|  | 158 | +        end | 
|---|
|  | 159 | +        true | 
|---|
|  | 160 | +      end | 
|---|
|  | 161 | +    end | 
|---|
|  | 162 | + | 
|---|
|  | 163 | +    # Verifies the encoding of the string and raises an exception when it's not valid | 
|---|
|  | 164 | +    def self.verify!(string) | 
|---|
|  | 165 | +      raise EncodingError.new("Found characters with invalid encoding") unless verify(string) | 
|---|
|  | 166 | +    end | 
|---|
|  | 167 | + | 
|---|
|  | 168 | +    if 'string'.respond_to?(:force_encoding) | 
|---|
|  | 169 | +      # Removes all invalid characters from the string. | 
|---|
|  | 170 | +      # | 
|---|
|  | 171 | +      # Note: this method is a no-op in Ruby 1.9 | 
|---|
|  | 172 | +      def self.clean(string) | 
|---|
|  | 173 | +        string | 
|---|
|  | 174 | +      end | 
|---|
|  | 175 | +    else | 
|---|
|  | 176 | +      def self.clean(string) | 
|---|
|  | 177 | +        if expression = valid_character | 
|---|
|  | 178 | +          stripped = []; for c in string.split(//) | 
|---|
|  | 179 | +            stripped << c if valid_character.match(c) | 
|---|
|  | 180 | +          end; stripped.join | 
|---|
|  | 181 | +        else | 
|---|
|  | 182 | +          string | 
|---|
|  | 183 | +        end | 
|---|
|  | 184 | +      end | 
|---|
|  | 185 | +    end | 
|---|
|  | 186 | +  end | 
|---|
|  | 187 | +end | 
|---|
|  | 188 | \ No newline at end of file | 
|---|
|  | 189 | diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb | 
|---|
|  | 190 | new file mode 100644 | 
|---|
|  | 191 | index 0000000..d8ac5ff | 
|---|
|  | 192 | --- /dev/null | 
|---|
|  | 193 | +++ b/activesupport/test/multibyte_utils_test.rb | 
|---|
|  | 194 | @@ -0,0 +1,141 @@ | 
|---|
|  | 195 | +# encoding: utf-8 | 
|---|
|  | 196 | + | 
|---|
|  | 197 | +require 'abstract_unit' | 
|---|
|  | 198 | +require 'multibyte_test_helpers' | 
|---|
|  | 199 | + | 
|---|
|  | 200 | +class MultibyteUtilsTest < ActiveSupport::TestCase | 
|---|
|  | 201 | +  include MultibyteTestHelpers | 
|---|
|  | 202 | + | 
|---|
|  | 203 | +  test "valid_character returns an expression for the current encoding" do | 
|---|
|  | 204 | +    with_encoding('None') do | 
|---|
|  | 205 | +      assert_nil ActiveSupport::Multibyte.valid_character | 
|---|
|  | 206 | +    end | 
|---|
|  | 207 | +    with_encoding('UTF8') do | 
|---|
|  | 208 | +      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character | 
|---|
|  | 209 | +    end | 
|---|
|  | 210 | +    with_encoding('SJIS') do | 
|---|
|  | 211 | +      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character | 
|---|
|  | 212 | +    end | 
|---|
|  | 213 | +  end | 
|---|
|  | 214 | + | 
|---|
|  | 215 | +  test "verify verifies ASCII strings are properly encoded" do | 
|---|
|  | 216 | +    with_encoding('None') do | 
|---|
|  | 217 | +      examples.each do |example| | 
|---|
|  | 218 | +        assert ActiveSupport::Multibyte.verify(example) | 
|---|
|  | 219 | +      end | 
|---|
|  | 220 | +    end | 
|---|
|  | 221 | +  end | 
|---|
|  | 222 | + | 
|---|
|  | 223 | +  test "verify verifies UTF-8 strings are properly encoded" do | 
|---|
|  | 224 | +    with_encoding('UTF8') do | 
|---|
|  | 225 | +      assert ActiveSupport::Multibyte.verify(example('valid UTF-8')) | 
|---|
|  | 226 | +      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8')) | 
|---|
|  | 227 | +    end | 
|---|
|  | 228 | +  end | 
|---|
|  | 229 | + | 
|---|
|  | 230 | +  test "verify verifies Shift-JIS strings are properly encoded" do | 
|---|
|  | 231 | +    with_encoding('SJIS') do | 
|---|
|  | 232 | +      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS')) | 
|---|
|  | 233 | +      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS')) | 
|---|
|  | 234 | +    end | 
|---|
|  | 235 | +  end | 
|---|
|  | 236 | + | 
|---|
|  | 237 | +  test "verify! raises an exception when it finds an invalid character" do | 
|---|
|  | 238 | +    with_encoding('UTF8') do | 
|---|
|  | 239 | +      assert_raises(ActiveSupport::Multibyte::EncodingError) do | 
|---|
|  | 240 | +        ActiveSupport::Multibyte.verify!(example('invalid UTF-8')) | 
|---|
|  | 241 | +      end | 
|---|
|  | 242 | +    end | 
|---|
|  | 243 | +  end | 
|---|
|  | 244 | + | 
|---|
|  | 245 | +  test "verify! doesn't raise an exception when the encoding is valid" do | 
|---|
|  | 246 | +    with_encoding('UTF8') do | 
|---|
|  | 247 | +      assert_nothing_raised do | 
|---|
|  | 248 | +        ActiveSupport::Multibyte.verify!(example('valid UTF-8')) | 
|---|
|  | 249 | +      end | 
|---|
|  | 250 | +    end | 
|---|
|  | 251 | +  end | 
|---|
|  | 252 | + | 
|---|
|  | 253 | +  if RUBY_VERSION < '1.9' | 
|---|
|  | 254 | +    test "clean leaves ASCII strings intact" do | 
|---|
|  | 255 | +      with_encoding('None') do | 
|---|
|  | 256 | +        [ | 
|---|
|  | 257 | +          'word', "\270\236\010\210\245" | 
|---|
|  | 258 | +        ].each do |string| | 
|---|
|  | 259 | +          assert_equal string, ActiveSupport::Multibyte.clean(string) | 
|---|
|  | 260 | +        end | 
|---|
|  | 261 | +      end | 
|---|
|  | 262 | +    end | 
|---|
|  | 263 | + | 
|---|
|  | 264 | +    test "clean cleans invalid characters from UTF-8 encoded strings" do | 
|---|
|  | 265 | +      with_encoding('UTF8') do | 
|---|
|  | 266 | +        cleaned_utf8 = [8].pack('C*') | 
|---|
|  | 267 | +        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8')) | 
|---|
|  | 268 | +        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8')) | 
|---|
|  | 269 | +      end | 
|---|
|  | 270 | +    end | 
|---|
|  | 271 | + | 
|---|
|  | 272 | +    test "clean cleans invalid characters from Shift-JIS encoded strings" do | 
|---|
|  | 273 | +      with_encoding('SJIS') do | 
|---|
|  | 274 | +        cleaned_sjis = [184, 0, 136, 165].pack('C*') | 
|---|
|  | 275 | +        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS')) | 
|---|
|  | 276 | +        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) | 
|---|
|  | 277 | +      end | 
|---|
|  | 278 | +    end | 
|---|
|  | 279 | +  else | 
|---|
|  | 280 | +    test "clean is a no-op" do | 
|---|
|  | 281 | +      with_encoding('UTF8') do | 
|---|
|  | 282 | +        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) | 
|---|
|  | 283 | +      end | 
|---|
|  | 284 | +    end | 
|---|
|  | 285 | +  end | 
|---|
|  | 286 | + | 
|---|
|  | 287 | +  private | 
|---|
|  | 288 | + | 
|---|
|  | 289 | +  STRINGS = { | 
|---|
|  | 290 | +    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'), | 
|---|
|  | 291 | +    'invalid ASCII'     => [128].pack('C*'), | 
|---|
|  | 292 | +    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'), | 
|---|
|  | 293 | +    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'), | 
|---|
|  | 294 | +    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'), | 
|---|
|  | 295 | +    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*') | 
|---|
|  | 296 | +  } | 
|---|
|  | 297 | + | 
|---|
|  | 298 | +  if Kernel.const_defined?(:Encoding) | 
|---|
|  | 299 | +    def example(key) | 
|---|
|  | 300 | +      STRINGS[key].force_encoding(Encoding.default_internal) | 
|---|
|  | 301 | +    end | 
|---|
|  | 302 | + | 
|---|
|  | 303 | +    def examples | 
|---|
|  | 304 | +      STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) } | 
|---|
|  | 305 | +    end | 
|---|
|  | 306 | +  else | 
|---|
|  | 307 | +    def example(key) | 
|---|
|  | 308 | +      STRINGS[key] | 
|---|
|  | 309 | +    end | 
|---|
|  | 310 | + | 
|---|
|  | 311 | +    def examples | 
|---|
|  | 312 | +      STRINGS.values | 
|---|
|  | 313 | +    end | 
|---|
|  | 314 | +  end | 
|---|
|  | 315 | + | 
|---|
|  | 316 | +  if 'string'.respond_to?(:encoding) | 
|---|
|  | 317 | +    def with_encoding(enc) | 
|---|
|  | 318 | +      before = Encoding.default_internal | 
|---|
|  | 319 | + | 
|---|
|  | 320 | +      case enc | 
|---|
|  | 321 | +      when 'UTF8' | 
|---|
|  | 322 | +        Encoding.default_internal = Encoding::UTF_8 | 
|---|
|  | 323 | +      when 'SJIS' | 
|---|
|  | 324 | +        Encoding.default_internal = Encoding::Shift_JIS | 
|---|
|  | 325 | +      else | 
|---|
|  | 326 | +        Encoding.default_internal = Encoding::BINARY | 
|---|
|  | 327 | +      end | 
|---|
|  | 328 | +      yield | 
|---|
|  | 329 | + | 
|---|
|  | 330 | +      Encoding.default_internal = before | 
|---|
|  | 331 | +    end | 
|---|
|  | 332 | +  else | 
|---|
|  | 333 | +    alias with_encoding with_kcode | 
|---|
|  | 334 | +  end | 
|---|
|  | 335 | +end | 
|---|
|  | 336 | \ No newline at end of file | 
|---|
|  | 337 |  | 
|---|