divide_up_dictionary_range

in lib/twitter_cldr/segmentation/cj_break_engine.rb [46:136]


      # Splits the text between the cursor's current position and +end_pos+
      # into words by searching for the cheapest chain of candidate words,
      # then yields the position at which each word in that chain ends.
      #
      # Candidate words starting at a given position come from three places:
      # * whatever +dictionary.matches+ reports for that position,
      # * a one-character fallback costing +MAX_SNLP+, added when the
      #   dictionary's first match is not exactly one character long and the
      #   code point is not in +hangul_word_set+,
      # * a run of katakana beginning at that position, costed by
      #   +get_katakana_cost+ according to its length.
      #
      # The cursor is consumed by this method: on return its position is
      # at or past +end_pos+.
      #
      # cursor  - object responding to +position+, +advance+ and +codepoint+.
      # end_pos - position at which the range stops (not itself examined).
      #
      # Yields each boundary position in ascending order. If no chain of
      # candidates reaches the end of the range, +end_pos+ is the only
      # boundary yielded.
      #
      # Returns an Enumerator when called without a block; otherwise the
      # array of boundaries in the order they were collected (last first).
      def divide_up_dictionary_range(cursor, end_pos, &block)
        return to_enum(__method__, cursor, end_pos) unless block_given?

        origin = cursor.position
        span = end_pos - origin

        # lowest_cost[i] is the cheapest total found so far for a chain of
        # words covering offsets 0...i; came_from[i] is the offset at which
        # the last word of that chain starts (-1 while unknown).
        lowest_cost = Array.new(span + 1, LARGE_NUMBER)
        came_from = Array.new(span + 1, -1)
        lowest_cost[0] = 0

        prior_was_katakana = false

        while cursor.position < end_pos
          offset = cursor.position - origin

          # No chain of words ends here, so no word can start here either.
          if lowest_cost[offset] == LARGE_NUMBER
            cursor.advance
            next
          end

          # Never look for words longer than MAX_WORD_SIZE or past end_pos.
          remaining = end_pos - cursor.position
          search_limit = remaining > MAX_WORD_SIZE ? MAX_WORD_SIZE : remaining

          found, word_costs, word_lengths, _unused = dictionary.matches(
            cursor, search_limit, search_limit
          )

          # Make sure a one-character step is always available, except for
          # code points listed in hangul_word_set.
          lacks_single_char_match = found == 0 || word_lengths[0] != 1

          if lacks_single_char_match && !hangul_word_set.include?(cursor.codepoint)
            word_costs[found] = MAX_SNLP
            word_lengths[found] = 1
            found += 1
          end

          # Relax every candidate word that starts at this offset.
          found.times do |k|
            candidate = lowest_cost[offset] + word_costs[k]
            target = word_lengths[k] + offset
            next unless candidate < lowest_cost[target]

            lowest_cost[target] = candidate
            came_from[target] = offset
          end

          # A run of katakana that begins here (i.e. the previous character
          # examined was not katakana) is offered as one more candidate word.
          # Runs that hit MAX_KATAKANA_GROUP_LENGTH are not offered.
          #
          # NOTE(review): the scan below moves the shared cursor itself, and
          # the unconditional advance at the bottom of the loop then steps
          # once more. Assuming #advance moves forward one position, the outer
          # loop resumes beyond the run rather than at offset + 1 - confirm
          # that this is intended.
          on_katakana = is_katakana?(cursor.codepoint)

          if on_katakana && !prior_was_katakana
            run_length = 1
            cursor.advance

            while cursor.position < end_pos && run_length < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
              cursor.advance
              run_length += 1
            end

            if run_length < MAX_KATAKANA_GROUP_LENGTH
              run_end = offset + run_length
              candidate = lowest_cost[offset] + get_katakana_cost(run_length)

              if candidate < lowest_cost[run_end]
                lowest_cost[run_end] = candidate
                came_from[run_end] = offset
              end
            end
          end

          prior_was_katakana = on_katakana

          cursor.advance
        end

        boundaries = []

        if lowest_cost[span] == LARGE_NUMBER
          # The end of the range was never reached: report it as one word.
          boundaries.push(end_pos)
        else
          # Walk the back-links from the end of the range to its start,
          # collecting word ends as absolute positions.
          offset = span

          until offset <= 0
            boundaries.push(offset + origin)
            offset = came_from[offset]
          end
        end

        boundaries.reverse_each(&block)
      end