読者です 読者をやめる 読者になる 読者になる

文字列をマルチバイト文字を意識せずに扱えるようなラッパークラス

ruby

まだ作業途中のネタなのだけど、Rubyレシピブックを見てみると、マルチバイト文字列の処理にはいろいろとバッドノウハウを覚えないといけないようだ。そこで、StringクラスのインスタンスをMultibyteStringクラスに変換することで、マルチバイト文字を意識せずに扱えるラッパークラスを定義してみたのだけど、どうだろうか。

# multibyte_string.rb
=begin
=VERSION=
Multibyte String ver 0.0.1

This library provides a convenient String wrapper for strings that include multibyte characters.
"Multibyte string" here means a string that includes multibyte characters.
Once a string is wrapped, you can use it just like an ordinary string.
You do not need to be conscious of whether the string has multibyte characters or not.

=SYNOPSIS=

$KCODE = "EUC"
require 'multibyte_string'

str = "あいうえお".to_mbstr # create MultibyteString instance from String.
str[1] #=> "い"
str.size #=> 5
str.index("う") #=> 2

=end

require 'delegate'

#=Warning=
# On Ruby 1.8, if $KCODE is "NONE" this library cannot work correctly.
# Please set $KCODE to a mode that supports multibyte characters
# (for example, "UTF-8", "EUC", "S_JIS").
#
# The check only runs where the Encoding class is absent (i.e. Ruby 1.8).
# From Ruby 1.9 on, $KCODE is no longer effective and reading it only makes
# the interpreter print its own warning, so the variable is left untouched.
unless defined?(Encoding)
  warn "$KCODE was not setted. Please set $KCODE." if $KCODE == "NONE"
end

# Character-oriented wrapper around a String.
#
# The position-based methods (slice / [], []=, slice!, length / size, index,
# rindex) are redefined so that every offset and length is counted in
# characters instead of bytes. Every String method that is not redefined
# here is forwarded unchanged to the wrapped string by DelegateClass.
#
# Characters are obtained with String#split(//); on Ruby 1.8 this requires
# $KCODE to be set to a multibyte mode.
class MultibyteString < DelegateClass(String)
  # Returns part of the string as a plain String, or nil when nothing matches.
  #
  # nth:: Integer  - the character at that position (or, with +len+, the
  #                  +len+ characters starting there);
  #       Range    - the characters covered by the range;
  #       String   - +nth+ itself when it occurs in the string;
  #       Regexp   - the matched text (with +len+, capture group +len+).
  # len:: optional character count (Integer +nth+) or capture index (Regexp).
  #
  # Out-of-range positions yield nil instead of raising.
  def slice(nth, len=nil)
    if nth.is_a?(Regexp)
      matched = nth.match(__getobj__)
      return matched && matched[len || 0]
    end

    unless len.nil?
      picked = mb_chars[nth, len]
      return picked && picked.join
    end

    case nth
    when Integer # Integer instead of Fixnum: Fixnum is gone in current Ruby
      mb_chars[nth]
    when Range
      picked = mb_chars[nth]
      picked && picked.join
    when String
      include?(nth) ? nth : nil
    end
  end

  alias [] slice

  # Replaces part of the wrapped string in place and returns +val+.
  #
  # nth:: Integer or Range - character position(s) to replace;
  #       String or Regexp - handed to the wrapped string's own []=, since
  #                          substring replacement does not involve offsets.
  # val:: replacement string.
  #
  # Raises TypeError when +val+ is not string-like and IndexError when an
  # Integer +nth+ lies outside the string.
  def []=(nth, val)
    unless val.respond_to?(:to_str)
      raise TypeError, "can't convert #{val.class} into String"
    end

    case nth
    when Integer
      chars = mb_chars
      # nth == chars.size is allowed and appends.
      if nth > chars.size || nth < -chars.size
        raise IndexError, "index #{nth} out of string"
      end
      chars[nth] = val
      replace(chars.join)
    when Range
      chars = mb_chars
      chars[nth] = val
      replace(chars.join)
    else
      __getobj__[nth] = val
    end
    val
  end

  # Removes part of the wrapped string in place and returns the removed text
  # as a plain String, or nil (leaving the string untouched) when the
  # requested part does not exist. Arguments are interpreted as in #slice.
  def slice!(nth, len=nil)
    case nth
    when Integer, Range
      chars = mb_chars
      removed = len.nil? ? chars.slice!(nth) : chars.slice!(nth, len)
      return nil if removed.nil?
      replace(chars.join)
      removed.is_a?(Array) ? removed.join : removed
    else
      len.nil? ? __getobj__.slice!(nth) : __getobj__.slice!(nth, len)
    end
  end

  # Returns the length of the string in bytes.
  def bytes
    raw = __getobj__
    # String#bytesize is missing on very old interpreters, where String#size
    # already counts bytes.
    raw.respond_to?(:bytesize) ? raw.bytesize : raw.size
  end

  # Return length of string in characters. To know about string's byte
  # length, use MultibyteString#bytes.
  def length
    mb_chars.size
  end

  alias size length

  # Returns how many times +str+ matches in the string (String#scan
  # semantics). Note that this differs from String#count, which counts
  # characters belonging to a set.
  def count(str)
    scan(str).size
  end

  # Returns the character index of the first match of +regex+ (a String or a
  # Regexp) at or after character position +pos+, or nil when there is none.
  # A negative +pos+ counts from the end of the string.
  def index(regex, pos=0)
    total = length
    pos += total if pos < 0
    return nil if pos < 0 || pos > total
    found = __getobj__.index(regex, native_offset(pos))
    found && char_offset(found)
  end

  # Returns the character index of the last match of +regex+ (a String or a
  # Regexp) that starts at or before character position +pos+, or nil when
  # there is none. A negative +pos+ counts from the end of the string.
  def rindex(regex, pos=self.length)
    total = length
    pos += total if pos < 0
    return nil if pos < 0
    pos = total if pos > total
    found = __getobj__.rindex(regex, native_offset(pos))
    found && char_offset(found)
  end

  private

  # The wrapped string split into an Array of single characters.
  def mb_chars
    __getobj__.split(//)
  end

  # Converts a character position (0..length) into the offset unit used by
  # the wrapped String's own index methods.
  def native_offset(pos)
    mb_chars[0, pos].join.size
  end

  # Converts an offset reported by the wrapped String back into a character
  # position.
  def char_offset(native)
    __getobj__[0, native].split(//).size
  end
end

# Core extension that adds a conversion helper to every String.
class String
  # Wraps the receiver in a new MultibyteString and returns the wrapper.
  def to_mbstr
    MultibyteString.new(self)
  end
end

# test_multibyte_string
require 'test/unit'
require 'multibyte_string'

# Unit tests for MultibyteString. Every scenario is checked against a
# five-character multibyte fixture and a five-character ASCII fixture.
class TestMultibyteString < Test::Unit::TestCase
  # Builds fresh fixtures before each test.
  def setup
    $KCODE = "EUC"
    @str1 = "あいうえお"
    @str2 = "abcde"
    @mbstr1 = MultibyteString.new(@str1)
    @mbstr2 = MultibyteString.new(@str2)
  end

  # Single position: slice(n) and [n].
  def test_slice_nth
    assert_equal("あ", @mbstr1.slice(0))
    assert_equal("あ", @mbstr1[0])
    assert_equal("a", @mbstr2.slice(0))
    assert_equal("a", @mbstr2[0])
  end

  # Start position plus length.
  def test_slice_nth_len
    assert_equal("あい", @mbstr1.slice(0, 2))
    assert_equal("あい", @mbstr1[0, 2])
    assert_equal("ab", @mbstr2.slice(0, 2))
    assert_equal("ab", @mbstr2[0, 2])
  end

  # Inclusive range (a..b).
  def test_slice_range_with_side
    assert_equal("あいう", @mbstr1.slice(0..2))
    assert_equal("あいう", @mbstr1[0..2])
    assert_equal("abc", @mbstr2.slice(0..2))
    assert_equal("abc", @mbstr2[0..2])
  end

  # Exclusive range (a...b).
  def test_slice_range_without_side
    assert_equal("あい", @mbstr1.slice(0...2))
    assert_equal("あい", @mbstr1[0...2])
    assert_equal("ab", @mbstr2.slice(0...2))
    assert_equal("ab", @mbstr2[0...2])
  end

  # Substring argument.
  def test_slice_substr
    assert_equal("あい", @mbstr1.slice("あい"))
    assert_equal("あい", @mbstr1["あい"])
    assert_equal("ab", @mbstr2.slice("ab"))
    assert_equal("ab", @mbstr2["ab"])
  end

  # Regexp argument, including a pattern that counts characters.
  def test_slice_regex
    assert_equal("あい", @mbstr1.slice(/あい/))
    assert_equal("あい", @mbstr1[/あい/])
    assert_equal("あいうえ", @mbstr1[/あ.{3}/])
    assert_equal("ab", @mbstr2.slice(/ab/))
    assert_equal("ab", @mbstr2[/ab/])
    assert_equal("abcd", @mbstr2[/a.{3}/])
  end

  # length counts characters.
  def test_length
    assert_equal(5, @mbstr1.length)
    assert_equal(5, @mbstr2.length)
  end

  # size is an alias of length.
  def test_size
    assert_equal(5, @mbstr1.size)
    assert_equal(5, @mbstr2.size)
  end

  # bytes counts bytes.
  def test_bytes
    assert_equal(10, @mbstr1.bytes)
    assert_equal(5, @mbstr2.bytes)
  end

  # count reports the number of occurrences.
  def test_count
    assert_equal(1, @mbstr1.count("あ"))
    assert_equal(1, @mbstr2.count("a"))
  end

  # index, with and without a start position.
  def test_index
    assert_equal(0, @mbstr1.index("あ"), %Q{"#{@mbstr1}".index("あ") test})
    assert_equal(1, @mbstr1.index("い"), %Q{"#{@mbstr1}".index("い") test})
    assert_equal(0, @mbstr2.index("a"), %Q{"#{@mbstr2}".index("a") test})
    assert_equal(1, @mbstr2.index("b"), %Q{"#{@mbstr2}".index("b") test})
    assert_equal(2, @mbstr2.index("c"), %Q{"#{@mbstr2}".index("c") test})

    str = "あいうあいうえ".to_mbstr
    assert_equal(3, str.index("あ", 3), %Q{"#{str}".index("あ", 3) test})
    assert_equal(4, str.index("い", 3), %Q{"#{str}".index("い", 3) test})

    str2 = "abcabcd".to_mbstr
    assert_equal(3, str2.index("a", 3), %Q{"#{str2}".index("a", 3) test})
    assert_equal(4, str2.index("b", 3), %Q{"#{str2}".index("b", 3) test})
  end

  # rindex, with and without a limiting position.
  def test_rindex
    assert_equal(0, @mbstr1.rindex("あ"), %Q{"#{@mbstr1}".rindex("あ") test})
    assert_equal(1, @mbstr1.rindex("い"), %Q{"#{@mbstr1}".rindex("い") test})
    assert_equal(0, @mbstr2.rindex("a"), %Q{"#{@mbstr2}".rindex("a") test})
    assert_equal(1, @mbstr2.rindex("b"), %Q{"#{@mbstr2}".rindex("b") test})
    assert_equal(2, @mbstr2.rindex("c"), %Q{"#{@mbstr2}".rindex("c") test})

    str = "あいうあいうえ".to_mbstr
    assert_equal(3, str.rindex("あ"), %Q{"#{str}".rindex("あ") test})
    assert_equal(4, str.rindex("い"), %Q{"#{str}".rindex("い") test})
    assert_equal(0, str.rindex("あ", 2), %Q{"#{str}".rindex("あ", 2) test})

    str2 = "abcabcd".to_mbstr
    assert_equal(3, str2.rindex("a"), %Q{"#{str2}".rindex("a") test})
    assert_equal(4, str2.rindex("b"), %Q{"#{str2}".rindex("b") test})
    assert_equal(0, str2.rindex("a", 2), %Q{"#{str2}".rindex("a", 2) test})
  end
end

既出な気もするけど、こういうのって需要ありますかね。まぁ完成度とか速度とかそういう面ではまだまだなのですが。

ちなみにRDocの書き方と英語が非常にマズい点はお見逃しください。