spec/segmentation/rule_set_spec.rb (114 lines of code) (raw):
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'spec_helper'
describe TwitterCldr::Segmentation::RuleSet do
let(:cursor) { TwitterCldr::Segmentation::Cursor }
let(:skip_cases) { [] }
let(:test_path) do
File.join(
TwitterCldr::RESOURCES_DIR, 'shared', 'segments', 'tests'
)
end
def parse(test_data)
parts = test_data
.split(/([÷×])/)
.map(&:strip)
.reject(&:empty?)
.map do |part|
if part =~ /[÷×]/
part
else
[part.to_i(16)].pack('U*')
end
end
end
def boundaries(test_parts)
idx = 0
[].tap do |boundaries|
test_parts.each do |part|
if part =~ /[÷×]/
boundaries << idx if part == '÷'
else
idx += 1
end
end
end
end
def string(test_parts)
test_parts.select.with_index do |part, idx|
idx % 2 == 1
end.join
end
def error_messages(failures)
messages = failures.map do |failure|
error_message(
failure[:test],
failure[:test_case_boundaries],
failure[:result_boundaries],
failure[:icu_boundaries]
)
end
<<END
Expected boundaries to match test cases
#{messages.join("\n")}
END
end
def error_message(test, test_case_boundaries, result_boundaries, icu_boundaries)
<<END
test case: #{test}
conformance boundaries: #{test_case_boundaries.inspect}
ICU boundaries: #{icu_boundaries.inspect}
actual boundaries: #{result_boundaries.inspect}
END
end
shared_examples 'a conformant implementation' do
it 'passes all Unicode test cases, falling back to matching ICU test results' do
failures = test_data.each_with_object([]).with_index do |(test, memo), idx|
test_parts = parse(test)
test_case_boundaries = boundaries(test_parts)
test_case_string = string(test_parts)
result_boundaries = iterator.each_boundary(test_case_string).to_a
passed_conformance_test = result_boundaries == test_case_boundaries
produced_same_results_as_icu = result_boundaries == icu_test_results[idx]
if !passed_conformance_test && !produced_same_results_as_icu
memo << {
test: test,
result_boundaries: result_boundaries,
test_case_boundaries: test_case_boundaries,
icu_boundaries: icu_test_results[idx]
}
end
end
expect(failures).to be_empty, error_messages(failures)
end
end
describe 'word boundaries' do
let(:test_file) { File.join(test_path, 'word_break_test.yml') }
let(:icu_test_results_file) { File.join(test_path, 'icu_word_break_test_results.yml') }
let(:test_data) { YAML.load_file(test_file) }
let(:icu_test_results) { YAML.load_file(icu_test_results_file) }
let(:iterator) { TwitterCldr::Segmentation::BreakIterator.iterator_for('word') }
it_behaves_like 'a conformant implementation'
end
describe 'sentence boundaries' do
let(:test_file) { File.join(test_path, 'sentence_break_test.yml') }
let(:icu_test_results_file) { File.join(test_path, 'icu_sentence_break_test_results.yml') }
let(:test_data) { YAML.load_file(test_file) }
let(:icu_test_results) { YAML.load_file(icu_test_results_file) }
let(:iterator) { TwitterCldr::Segmentation::BreakIterator.iterator_for('sentence') }
it_behaves_like 'a conformant implementation'
end
describe 'grapheme boundaries' do
let(:test_file) { File.join(test_path, 'grapheme_break_test.yml') }
let(:icu_test_results_file) { File.join(test_path, 'icu_grapheme_break_test_results.yml') }
let(:test_data) { YAML.load_file(test_file) }
let(:icu_test_results) { YAML.load_file(icu_test_results_file) }
let(:iterator) { TwitterCldr::Segmentation::BreakIterator.iterator_for('grapheme') }
it_behaves_like 'a conformant implementation'
end
describe 'line boundaries' do
let(:test_file) { File.join(test_path, 'line_break_test.yml') }
let(:icu_test_results_file) { File.join(test_path, 'icu_line_break_test_results.yml') }
let(:test_data) { YAML.load_file(test_file) }
let(:icu_test_results) { YAML.load_file(icu_test_results_file) }
let(:iterator) { TwitterCldr::Segmentation::BreakIterator.iterator_for('line') }
it_behaves_like 'a conformant implementation'
end
end