run_conformance_tests_with_icu

in lib/twitter_cldr/resources/segment_tests_importer.rb [126:170]


      # Runs every conformance test line through collect_boundaries and returns
      # the reported boundaries expressed as indices into the line's sequence of
      # code points.
      #
      # @param conformance_file [String] path of the conformance file; only its
      #   basename is inspected, to pick the boundary type.
      # @param test_lines [Enumerable<String>] lines made of hexadecimal code
      #   points separated by "÷" / "×" markers.
      # @return [Array<Array<Integer, nil>>] for each line, one logical index per
      #   reported boundary (nil if an offset does not fall on a code point edge).
      # @raise [ArgumentError] if the basename is not a known conformance file.
      def run_conformance_tests_with_icu(conformance_file, test_lines)
        file_name = File.basename(conformance_file)

        boundary_type = case file_name
          when 'WordBreakTest.txt'
            :word
          when 'SentenceBreakTest.txt'
            :sentence
          when 'GraphemeBreakTest.txt'
            :grapheme
          when 'LineBreakTest.txt'
            :line
          else
            # Fail fast instead of handing a nil boundary type to collect_boundaries.
            raise ArgumentError, "unsupported conformance file: #{file_name}"
        end

        test_lines.map do |test_line|
          test_codepoints = test_line
            .split(/[÷×]/)
            .map(&:strip)
            .reject(&:empty?)
            .map { |cp| cp.to_i(16) }

          # Maps an offset counted in UTF-16 code units to the index of the code
          # point that starts there. Code points above U+FFFF occupy two code
          # units, so the two numberings diverge after the first such code point.
          logical_position_map = {}
          utf_16_pos = 0

          test_codepoints.each_with_index do |cp, idx|
            logical_position_map[utf_16_pos] = idx

            # UTF-16BE carries no byte order mark, so bytes / 2 is exactly the
            # number of code units this code point needs.
            utf_16_pos += [cp].pack('U*').encode(Encoding::UTF_16BE).bytesize / 2
          end

          # The position just past the last code point is a valid offset as well.
          logical_position_map[utf_16_pos] = test_codepoints.size

          test_str = test_codepoints.pack('U*')

          # NOTE(review): assumes collect_boundaries reports offsets in UTF-16
          # code units - confirm against its definition.
          collect_boundaries(test_str, boundary_type).map do |boundary|
            logical_position_map[boundary]
          end
        end
      end