in lib/twitter_cldr/segmentation/brahmic_break_engine.rb [55:125]
# Scans from the cursor's current position up to end_pos and yields the
# offset at which each non-empty segment ends.
#
# Returns an Enumerator over those offsets when no block is given, and
# returns without yielding when fewer than min_word_span positions remain
# between the cursor and end_pos.
#
# @param cursor [#position, #position=, #codepoint, #advance] moved in place
#   as the scan proceeds
# @param end_pos [Integer] exclusive upper bound of the scan
# @yieldparam boundary [Integer] state.current + state.word_length
def divide_up_dictionary_range(cursor, end_pos)
  return to_enum(__method__, cursor, end_pos) unless block_given?
  return if (end_pos - cursor.position) < min_word_span

  state = EngineState.new(
    cursor: cursor,
    end_pos: end_pos,
    words: PossibleWordList.new(lookahead)
  )

  until cursor.position >= end_pos
    # Every pass starts a fresh segment at the cursor.
    state.current = cursor.position
    state.word_length = 0

    # Number of candidates the current word slot reports at this position
    # (presumably dictionary matches -- confirm against PossibleWord).
    match_count = state.words[state.words_found].candidates(
      cursor, dictionary, end_pos
    )

    if match_count >= 1
      # With several candidates, let mark_best_candidate choose before
      # accepting; a single candidate is accepted directly.
      mark_best_candidate(cursor, end_pos, state) if match_count > 1
      state.word_length = state.words[state.words_found].accept_marked(cursor)
      state.words_found += 1
    end

    # Only probe what follows when input remains and the segment so far is
    # shorter than root_combine_threshold.
    if cursor.position < end_pos && state.word_length < root_combine_threshold
      following_count = state.words[state.words_found].candidates(
        cursor, dictionary, end_pos
      )

      # longest_prefix is consulted only when nothing follows and a word was
      # accepted above, matching the short-circuit order callers rely on.
      if following_count > 0 || (state.word_length != 0 && state.words[state.words_found].longest_prefix >= prefix_combine_threshold)
        # Rewind to the end of the segment accepted so far; the probe above
        # may have moved the cursor.
        cursor.position = state.current + state.word_length
      else
        advance_to_plausible_word_boundary(cursor, end_pos, state)
      end
    end

    # Absorb any code points from mark_set into the current segment.
    until cursor.position >= end_pos || !mark_set.include?(cursor.codepoint)
      cursor.advance
      state.word_length += 1
    end

    # advance_past_suffix returns a callable whose result extends the segment.
    state.word_length += advance_past_suffix.call(cursor, end_pos, state)

    yield state.current + state.word_length if state.word_length > 0
  end
end