spec/tokenizers/unicode_regex/unicode_regex_tokenizer_spec.rb (162 lines of code) (raw):
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'spec_helper'
# Specs for TwitterCldr::Tokenizers::UnicodeRegexTokenizer#tokenize.
#
# Every example feeds a pattern string to the tokenizer and compares the result
# against an ordered list of expected { value:, type: } pairs.
#
# NOTE(review): check_token_list is not defined in this file; presumably it is a
# shared helper loaded through spec_helper -- confirm before relying on its
# exact matching semantics.
describe TwitterCldr::Tokenizers::UnicodeRegexTokenizer do
  describe "#tokenize" do
    # Tokenizer under test, built with no constructor arguments.
    let(:tokenizer) { described_class.new }

    # Shorthand for running the tokenizer under test on +str+.
    #
    # @param str [String] the pattern text to tokenize
    # @return whatever tokenizer.tokenize returns for +str+
    def tokenize(str)
      tokenizer.tokenize(str)
    end

    it "should tokenize a regular regex" do
      got = tokenize("^(ab)xy$")
      # A leading "^" is expected as :negate even though it sits outside a
      # bracket expression here.
      expected = [
        { value: "^", type: :negate },
        { value: "(", type: :special_char },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: ")", type: :special_char },
        { value: "x", type: :string },
        { value: "y", type: :string },
        { value: "$", type: :special_char }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing a basic character class" do
      got = tokenize("a[bc]d")
      expected = [
        { value: "a", type: :string },
        { value: "[", type: :open_bracket },
        { value: "b", type: :string },
        { value: "c", type: :string },
        { value: "]", type: :close_bracket },
        { value: "d", type: :string }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing unicode character sets" do
      # Both the \p{..} and the [:..:] spellings are expected as :character_set.
      got = tokenize("\\p{Zs}[:Lu:]")
      expected = [
        { value: "\\p{Zs}", type: :character_set },
        { value: "[:Lu:]", type: :character_set }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped characters" do
      got = tokenize("^[a\\b]\\$")
      expected = [
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "\\b", type: :escaped_character },
        { value: "]", type: :close_bracket },
        { value: "\\$", type: :escaped_character }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing basic character ranges" do
      got = tokenize("[a-z0-9]|[ab]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "-", type: :dash },
        { value: "z", type: :string },
        { value: "0", type: :string },
        { value: "-", type: :dash },
        { value: "9", type: :string },
        { value: "]", type: :close_bracket },
        { value: "|", type: :pipe },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped unicode characters" do
      got = tokenize("\\u0020[\\u0123-\\u0155]")
      expected = [
        { value: "\\u0020", type: :unicode_char },
        { value: "[", type: :open_bracket },
        { value: "\\u0123", type: :unicode_char },
        { value: "-", type: :dash },
        { value: "\\u0155", type: :unicode_char },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing variable substitutions" do
      # "$CR" and "$LF" are expected as single :variable tokens, unlike the
      # bare "$" in the first example, which is a :special_char.
      got = tokenize("$CR(?:ab)[$LF]")
      expected = [
        { value: "$CR", type: :variable },
        { value: "(", type: :special_char },
        { value: "?", type: :special_char },
        { value: ":", type: :special_char },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: ")", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "$LF", type: :variable },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing multichar strings" do
      got = tokenize("[{foo}bar]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "{foo}", type: :multichar_string },
        { value: "b", type: :string },
        { value: "a", type: :string },
        { value: "r", type: :string },
        { value: "]", type: :close_bracket }
      ]
      # Without this comparison the example would pass no matter what the
      # tokenizer returned.
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing negated character sets" do
      # Both the [:^..:] and the \P{..} spellings are expected as
      # :negated_character_set.
      got = tokenize("[[:^N:]\\P{L}]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "[:^N:]", type: :negated_character_set },
        { value: "\\P{L}", type: :negated_character_set },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing some of everything" do
      got = tokenize("^[a-zb]?[^[\\p{Z}\\u0020-\\u007f]-[\\P{L}]-[[:N:]\\u0123]][:^CC:]*[{foo}]+$")
      expected = [
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "-", type: :dash },
        { value: "z", type: :string },
        { value: "b", type: :string },
        { value: "]", type: :close_bracket },
        { value: "?", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "\\p{Z}", type: :character_set },
        { value: "\\u0020", type: :unicode_char },
        { value: "-", type: :dash },
        { value: "\\u007f", type: :unicode_char },
        { value: "]", type: :close_bracket },
        { value: "-", type: :dash },
        { value: "[", type: :open_bracket },
        { value: "\\P{L}", type: :negated_character_set },
        { value: "]", type: :close_bracket },
        { value: "-", type: :dash },
        { value: "[", type: :open_bracket },
        { value: "[:N:]", type: :character_set },
        { value: "\\u0123", type: :unicode_char },
        { value: "]", type: :close_bracket },
        { value: "]", type: :close_bracket },
        { value: "[:^CC:]", type: :negated_character_set },
        { value: "*", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "{foo}", type: :multichar_string },
        { value: "]", type: :close_bracket },
        { value: "+", type: :special_char },
        { value: "$", type: :special_char }
      ]
      check_token_list(got, expected)
    end
  end
end