in lib/twitter_cldr/segmentation/cj_break_engine.rb [46:136]
# Splits the range that begins at the cursor's current position and ends
# just before +end_pos+ into words, yielding the absolute end position of
# each word in ascending order.
#
# The split is a lowest-cost path search. best_snlp[i] is the cheapest
# total cost found so far for segmenting the first i positions of the
# range, and prev[i] is the start index of the word that ends at i on that
# cheapest path. Candidate words come from three places: dictionary
# matches, a one-position fallback word costing MAX_SNLP, and whole runs of
# katakana costed by get_katakana_cost.
#
# If no path reaches the end of the range, +end_pos+ is yielded as the only
# boundary.
#
# @param cursor  object responding to #position, #position=, #codepoint
#   and #advance. It is moved forward as the range is scanned.
#   NOTE(review): #position= is needed to rewind after the katakana
#   look-ahead; confirm the cursor class exposes a writer.
# @param end_pos [Integer] exclusive end of the range, on the same scale
#   as cursor.position
# @yield [Integer] absolute end position of each word
# @return [Enumerator] when called without a block
def divide_up_dictionary_range(cursor, end_pos, &block)
  return to_enum(__method__, cursor, end_pos) unless block_given?

  start_pos = cursor.position
  input_length = end_pos - start_pos

  best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
  prev = Array.new(input_length + 1) { -1 }
  best_snlp[0] = 0

  is_prev_katakana = false

  until cursor.position >= end_pos
    idx = cursor.position - start_pos

    # No segmentation reaches this position, so no word can start here.
    if best_snlp[idx] == LARGE_NUMBER
      cursor.advance
      next
    end

    # Never search past the end of the range.
    max_search_length = [MAX_WORD_SIZE, end_pos - cursor.position].min

    count, values, lengths, _ = dictionary.matches(
      cursor, max_search_length, max_search_length
    )

    # When the dictionary offers no one-position word here, add one with
    # the fallback cost so the path can always move forward. Codepoints in
    # hangul_word_set are excluded from this fallback.
    if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
      values[count] = MAX_SNLP
      lengths[count] = 1
      count += 1
    end

    # Relax every candidate word that starts at idx.
    count.times do |j|
      word_end = lengths[j] + idx
      new_snlp = best_snlp[idx] + values[j]

      if new_snlp < best_snlp[word_end]
        best_snlp[word_end] = new_snlp
        prev[word_end] = idx
      end
    end

    is_katakana = is_katakana?(cursor.codepoint)

    # At the first character of a katakana run, offer the whole run as a
    # single candidate word.
    if !is_prev_katakana && is_katakana
      run_start = cursor.position
      j = idx + 1
      cursor.advance

      while cursor.position < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
        cursor.advance
        j += 1
      end

      # Runs that hit the length cap are not offered as a word.
      if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
        new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)

        if new_snlp < best_snlp[j]
          best_snlp[j] = new_snlp
          prev[j] = idx
        end
      end

      # The scan above was only a look-ahead. Rewind so the main loop still
      # visits every position inside and directly after the run; otherwise
      # those positions are skipped and paths through them are never
      # explored.
      cursor.position = run_start
    end

    is_prev_katakana = is_katakana
    cursor.advance
  end

  # Walk the prev links back from the end to recover the boundaries; they
  # come out in reverse order.
  t_boundary = []

  if best_snlp[input_length] == LARGE_NUMBER
    # No path reached the end: treat the whole range as one word.
    t_boundary << end_pos
  else
    idx = end_pos - start_pos

    while idx > 0
      t_boundary << idx + start_pos
      idx = prev[idx]
    end
  end

  t_boundary.reverse_each(&block)
end