in lib/twitter_cldr/resources/segment_tests_importer.rb [126:170]
# Feeds every conformance test line to collect_boundaries and translates the
# offsets it returns into codepoint indices.
#
# @param conformance_file [String] path of the conformance file; only its
#   basename is inspected, to pick the boundary type
# @param test_lines [Array<String>] lines of hexadecimal codepoints separated
#   by "÷" / "×" markers, e.g. "÷ 0061 × 0308 ÷"
# @return [Array<Array<Integer, nil>>] one array per test line holding, for
#   each offset returned by collect_boundaries, the index of the codepoint
#   starting at that offset (or the codepoint count for the end of the
#   string); an offset that matches no codepoint edge becomes nil
def run_conformance_tests_with_icu(conformance_file, test_lines)
  # An unrecognized file name yields nil, which is passed to
  # collect_boundaries unchanged.
  boundary_type = {
    'WordBreakTest.txt' => :word,
    'SentenceBreakTest.txt' => :sentence,
    'GraphemeBreakTest.txt' => :grapheme,
    'LineBreakTest.txt' => :line
  }[File.basename(conformance_file)]

  test_lines.map do |test_line|
    test_codepoints = test_line
      .split(/[÷×]/)
      .map(&:strip)
      .reject(&:empty?)
      .map { |cp| cp.to_i(16) }

    # Key: UTF-16 code unit offset at which a codepoint starts.
    # Value: index of that codepoint within the test line.
    logical_position_map = {}
    utf_16_pos = 0

    test_codepoints.each_with_index do |cp, idx|
      logical_position_map[utf_16_pos] = idx
      # UTF-16LE output carries no byte order mark, so half the byte size is
      # exactly the number of code units (2 for supplementary codepoints).
      utf_16_pos += [cp].pack('U*').encode(Encoding::UTF_16LE).bytesize / 2
    end

    # The offset just past the last codepoint marks the end of the string.
    logical_position_map[utf_16_pos] = test_codepoints.size

    test_str = test_codepoints.pack('U*')

    # NOTE(review): the returned offsets are treated as UTF-16 code unit
    # positions -- confirm against collect_boundaries.
    collect_boundaries(test_str, boundary_type).map do |boundary|
      logical_position_map[boundary]
    end
  end
end