文字列をマルチバイト文字を意識せずに扱えるようなラッパークラス
まだ作業途中のネタなのだけど、Rubyレシピブックを見てみると、マルチバイト文字列の処理にはいろいろとバッドノウハウを覚えないといけないようだ。そこで、StringクラスのインスタンスをMultibyteStringクラスに変換することで、マルチバイト文字を意識せずに扱えるクラスを定義してみたのだけど、どうだろうか。
# multibyte_string.rb

=begin
=VERSION=
  Multibyte String ver 0.0.1

  This library is a thin wrapper around String for strings that include
  multibyte characters.
  "Multibyte String" means a string which includes multibyte characters.
  If you use this wrapper, you can treat such a string as just a string:
  you do not need to be conscious of whether it contains multibyte
  characters or not.

=SYNOPSIS=
  $KCODE = "EUC"
  require 'multibyte_string'

  str = "あいうえお".to_mbstr  # create MultibyteString instance from String.
  str[1]          #=> "い"
  str.size        #=> 5
  str.index("う") #=> 2
=end

require 'delegate'

#=Warning=
# If $KCODE is "NONE", this library cannot work correctly.
# Please set $KCODE to a multibyte-aware mode
# (for example, "UTF-8", "EUC", "S_JIS").
if $KCODE == "NONE"
  warn "$KCODE was not setted. Please set $KCODE."
end

# Wraps a String (via DelegateClass) and redefines the position-based
# methods so that positions and lengths are counted in characters
# (as produced by split(//)) instead of in bytes. Every other method is
# delegated to the wrapped String unchanged.
class MultibyteString < DelegateClass(String)
  # str:: the String to wrap. Mutating methods modify this object in place.
  def initialize(str)
    super str
  end

  # Character-based String#slice.
  #
  # nth:: Integer index, Range of indices, substring (String) or Regexp.
  # len:: number of characters when nth is an Integer start position, or
  #       the capture group number when nth is a Regexp.
  #
  # Returns the selected String, or nil when nothing is selected
  # (index out of range, substring not included, pattern not matched).
  def slice(nth, len=nil)
    if nth.is_a?(Regexp)
      matched = nth.match(__getobj__)
      return matched && matched[len || 0]
    end

    chars = self.split(//)
    unless len.nil?
      # Array#[] returns nil for an out-of-range start; do not call join on it.
      picked = chars[nth, len]
      return picked && picked.join
    end

    case nth
    when Integer
      chars[nth]
    when Range
      picked = chars[nth]
      picked && picked.join
    when String
      self.include?(nth) ? nth : nil
    end
  end
  alias [] slice

  # Character-based String#[]= (single-index form only).
  #
  # nth:: Integer index, Range of indices, substring (String) or Regexp.
  # val:: replacement; converted with to_s.
  #
  # Replaces the selected part of the wrapped String in place and returns
  # val. Raises IndexError when an Integer index is out of range or when a
  # substring / Regexp does not match.
  def []=(nth, val)
    raw = __getobj__
    case nth
    when Regexp, String
      matched = mb_pattern(nth).match(raw)
      if matched.nil?
        raise IndexError,
              (nth.is_a?(Regexp) ? "regexp not matched" : "string not matched")
      end
      raw.replace(matched.pre_match + val.to_s + matched.post_match)
    else
      chars = raw.split(//)
      if nth.is_a?(Integer) && (nth >= chars.size || nth < -chars.size)
        raise IndexError, "index #{nth} out of string"
      end
      # For a Range, Array#[]= swaps the whole range for this one element.
      chars[nth] = val.to_s
      raw.replace(chars.join)
    end
    val
  end

  # Character-based String#slice!.
  #
  # Takes the same arguments as #slice, removes the selected part from the
  # wrapped String in place and returns the removed String. Returns nil,
  # leaving the string untouched, when nothing is selected.
  def slice!(nth, len=nil)
    raw = __getobj__
    case nth
    when Regexp
      group = len || 0
      matched = nth.match(raw)
      return nil if matched.nil? || matched[group].nil?
      removed = matched[group]
      # begin/end and String#[]= use the same unit (bytes or characters)
      # within one Ruby version, so the offsets can be used directly.
      raw[matched.begin(group)...matched.end(group)] = ""
      removed
    when String
      matched = mb_pattern(nth).match(raw)
      return nil if matched.nil?
      raw.replace(matched.pre_match + matched.post_match)
      matched[0]
    else
      chars = raw.split(//)
      # Resolve the selection on a list of positions first, so negative
      # indices, ranges and (start, len) all share one code path.
      positions = (0...chars.size).to_a
      hit = len.nil? ? positions[nth] : positions[nth, len]
      return nil if hit.nil?
      hit = [hit] if hit.is_a?(Integer)
      removed = hit.map { |i| chars[i] }.join
      raw.replace((positions - hit).map { |i| chars[i] }.join)
      removed
    end
  end

  # Returns the byte length of the string (#size and #length count
  # characters). Uses String#bytesize where it exists; otherwise falls
  # back to String#size of the wrapped object.
  def bytes
    raw = __getobj__
    raw.respond_to?(:bytesize) ? raw.bytesize : raw.size
  end

  # Returns the length of the string in characters. To know the string's
  # byte length, use MultibyteString#bytes.
  def length
    self.split(//).size
  end
  alias size length

  # Returns how many times str (a substring or a Regexp) occurs,
  # as found by scan.
  def count(str)
    self.scan(str).size
  end

  # Character-based String#index.
  #
  # regex:: substring (String) or Regexp to look for.
  # pos::   character position to start searching from; a negative value
  #         counts from the end.
  #
  # Returns the character index of the first match at or after pos, or nil
  # when there is no match. The pattern is matched against the tail of the
  # string beginning at pos, so anchors are relative to that tail.
  def index(regex, pos=0)
    chars = self.split(//)
    pos += chars.size if pos < 0
    return nil if pos < 0 || pos > chars.size
    matched = mb_pattern(regex).match(chars[pos..-1].join)
    matched && pos + matched.pre_match.split(//).size
  end

  # Character-based String#rindex.
  #
  # regex:: substring (String) or Regexp to look for.
  # pos::   last character position at which a match may start; a negative
  #         value counts from the end.
  #
  # Returns the character index of the last match starting at or before
  # pos, or nil when there is no match.
  def rindex(regex, pos=self.length)
    chars = self.split(//)
    pos += chars.size if pos < 0
    return nil if pos < 0
    pos = chars.size if pos > chars.size
    pattern = mb_pattern(regex)
    pos.downto(0) do |start|
      matched = pattern.match(chars[start..-1].join)
      # Matching is leftmost-first, so an empty pre_match means the
      # pattern matches exactly at this start position.
      return start if matched && matched.pre_match.empty?
    end
    nil
  end

  private

  # Returns pat itself when it is a Regexp, otherwise a Regexp matching
  # pat.to_s literally.
  def mb_pattern(pat)
    pat.is_a?(Regexp) ? pat : Regexp.new(Regexp.escape(pat.to_s))
  end
end

class String
  # Wraps this String in a MultibyteString.
  def to_mbstr
    MultibyteString.new self
  end
end

# test_multibyte_string
require 'test/unit'
require 'multibyte_string'

class TestMultibyteString < Test::Unit::TestCase
  def setup
    $KCODE = "EUC"
    @str1 = "あいうえお"
    @str2 = "abcde"
    @mbstr1 = MultibyteString.new @str1
    @mbstr2 = MultibyteString.new @str2
  end

  def test_slice_nth
    assert_equal "あ", @mbstr1.slice(0)
    assert_equal "あ", @mbstr1[0]
    assert_equal "a", @mbstr2.slice(0)
    assert_equal "a", @mbstr2[0]
  end

  def test_slice_nth_len
    assert_equal "あい", @mbstr1.slice(0, 2)
    assert_equal "あい", @mbstr1[0, 2]
    assert_equal "ab", @mbstr2.slice(0, 2)
    assert_equal "ab", @mbstr2[0, 2]
  end

  def test_slice_range_with_side
    assert_equal "あいう", @mbstr1.slice(0..2)
    assert_equal "あいう", @mbstr1[0..2]
    assert_equal "abc", @mbstr2.slice(0..2)
    assert_equal "abc", @mbstr2[0..2]
  end

  def test_slice_range_without_side
    assert_equal "あい", @mbstr1.slice(0...2)
    assert_equal "あい", @mbstr1[0...2]
    assert_equal "ab", @mbstr2.slice(0...2)
    assert_equal "ab", @mbstr2[0...2]
  end

  def test_slice_substr
    assert_equal "あい", @mbstr1.slice("あい")
    assert_equal "あい", @mbstr1["あい"]
    assert_equal "ab", @mbstr2.slice("ab")
    assert_equal "ab", @mbstr2["ab"]
  end

  def test_slice_regex
    assert_equal "あい", @mbstr1.slice(/あい/)
    assert_equal "あい", @mbstr1[/あい/]
    assert_equal "あいうえ", @mbstr1[/あ.{3}/]
    assert_equal "ab", @mbstr2.slice(/ab/)
    assert_equal "ab", @mbstr2[/ab/]
    assert_equal "abcd", @mbstr2[/a.{3}/]
  end

  def test_slice_out_of_range
    assert_nil @mbstr1.slice(10)
    assert_nil @mbstr1.slice(10..12)
    assert_nil @mbstr1.slice(10, 2)
    assert_nil @mbstr2.slice(/z/)
  end

  def test_length
    assert_equal 5, @mbstr1.length
    assert_equal 5, @mbstr2.length
  end

  def test_size
    assert_equal 5, @mbstr1.size
    assert_equal 5, @mbstr2.size
  end

  def test_bytes
    assert_equal 10, @mbstr1.bytes
    assert_equal 5, @mbstr2.bytes
  end

  def test_count
    assert_equal 1, @mbstr1.count("あ")
    assert_equal 1, @mbstr2.count("a")
  end

  def test_index
    assert_equal 0, @mbstr1.index("あ"), %Q{"#@mbstr1".index("あ") test}
    assert_equal 1, @mbstr1.index("い"), %Q{"#@mbstr1".index("い") test}
    assert_equal 0, @mbstr2.index("a"), %Q{"#@mbstr2".index("a") test}
    assert_equal 1, @mbstr2.index("b"), %Q{"#@mbstr2".index("b") test}
    assert_equal 2, @mbstr2.index("c"), %Q{"#@mbstr2".index("c") test}

    str = "あいうあいうえ".to_mbstr
    assert_equal 3, str.index("あ", 3), %Q{"#{str}".index("あ", 3) test}
    assert_equal 4, str.index("い", 3), %Q{"#{str}".index("い", 3) test}

    str2 = "abcabcd".to_mbstr
    assert_equal 3, str2.index("a", 3), %Q{"#{str2}".index("a", 3) test}
    assert_equal 4, str2.index("b", 3), %Q{"#{str2}".index("b", 3) test}
  end

  def test_index_not_found
    assert_nil @mbstr1.index("ん")
    assert_nil @mbstr2.index("z")
    assert_nil @mbstr2.index("a", 1)
  end

  def test_index_whole_substr
    assert_equal 1, "ああい".to_mbstr.index("あい")
    assert_equal 1, "aab".to_mbstr.index("ab")
  end

  def test_rindex
    assert_equal 0, @mbstr1.rindex("あ"), %Q{"#@mbstr1".rindex("あ") test}
    assert_equal 1, @mbstr1.rindex("い"), %Q{"#@mbstr1".rindex("い") test}
    assert_equal 0, @mbstr2.rindex("a"), %Q{"#@mbstr2".rindex("a") test}
    assert_equal 1, @mbstr2.rindex("b"), %Q{"#@mbstr2".rindex("b") test}
    assert_equal 2, @mbstr2.rindex("c"), %Q{"#@mbstr2".rindex("c") test}

    str = "あいうあいうえ".to_mbstr
    assert_equal 3, str.rindex("あ"), %Q{"#{str}".rindex("あ") test}
    assert_equal 4, str.rindex("い"), %Q{"#{str}".rindex("い") test}
    assert_equal 0, str.rindex("あ", 2), %Q{"#{str}".rindex("あ", 2) test}

    str2 = "abcabcd".to_mbstr
    assert_equal 3, str2.rindex("a"), %Q{"#{str2}".rindex("a") test}
    assert_equal 4, str2.rindex("b"), %Q{"#{str2}".rindex("b") test}
    assert_equal 0, str2.rindex("a", 2), %Q{"#{str2}".rindex("a", 2) test}
  end

  def test_rindex_not_found
    assert_nil @mbstr1.rindex("ん")
    assert_nil @mbstr2.rindex("z")
  end

  def test_aset
    str = "あいうえお".to_mbstr
    str[1] = "か"
    assert_equal "あかうえお", str.to_s

    str2 = "abcde".to_mbstr
    str2[1..2] = "X"
    assert_equal "aXde", str2.to_s
  end

  def test_slice_bang
    str = "あいうえお".to_mbstr
    assert_equal "いう", str.slice!(1, 2)
    assert_equal "あえお", str.to_s
    assert_nil str.slice!(10)
    assert_equal "あえお", str.to_s
  end
end
既出な気もするけど、こういうのって需要ありますかね。まぁ完成度とか速度とかそういう面ではまだまだなのですが。
ちなみにRDocの書き方と英語が非常にマズい点はお見逃しください。