lib/twitter_cldr/js/mustache/implementation/parsers/unicode_regex_parser.coffee (143 lines of code) (raw):
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
class TwitterCldr.UnicodeRegexParser extends TwitterCldr.Parser
constructor : ->
# Types that are allowed to be used in character ranges.
@character_class_token_types = [
"variable", "character_set", "negated_character_set", "unicode_char",
"multichar_string", "string", "escaped_character", "character_range"
]
@negated_token_types = [
"negated_character_set"
]
@binary_operators = [
"pipe", "ampersand", "dash", "union"
]
@unary_operators = [
"negate"
]
parse : (tokens, options = {}) ->
super(@preprocess(@substitute_variables(tokens, options.symbol_table)), options)
make_token : (type, value) ->
new TwitterCldr.Token ({"type": type, "value" : value})
# Identifies regex ranges and makes implicit operators explicit
preprocess : (tokens) ->
result = []
i = 0
while i < tokens.length
# Character class entities side-by-side are treated as unions. So
# are side-by-side character classes. Add a special placeholder token
# to help out the expression parser.
add_union = (@is_valid_character_class_token(result[result.length-1]) and tokens[i].type != "close_bracket") ||
(result[result.length-1]? and result[result.length-1].type == "close_bracket" and tokens[i].type == "open_bracket")
result.push(@make_token("union")) if add_union
is_range = @is_valid_character_class_token(tokens[i]) and
@is_valid_character_class_token(tokens[i + 2]) and
tokens[i + 1].type == "dash"
if is_range
initial = @[tokens[i].type](tokens[i])
final = @[tokens[i+2].type](tokens[i+2])
result.push(@make_character_range(initial, final))
i += 3
else
if @is_negated_token(tokens[i])
result = result.concat [
@make_token("open_bracket")
@make_token("negate")
tokens[i]
@make_token("close_bracket")
]
else
result.push(tokens[i])
i += 1
result
substitute_variables : (tokens, symbol_table) ->
return tokens unless symbol_table?
result = []
for i in [0...tokens.length] by 1
token = tokens[i]
if token.type == "variable" and (sub = symbol_table.fetch(token.value))?
# variables can themselves contain references to other variables
# note: this could be cached somehow
result = result.concat(@substitute_variables(sub, symbol_table))
else
result.push token
result
make_character_range : (initial, final) ->
new TwitterCldr.CharacterRange(initial, final)
is_negated_token : (token) ->
token? and token.type in @negated_token_types
is_valid_character_class_token : (token) ->
token? and token.type in @character_class_token_types
is_unary_operator : (token) ->
token? and token.type in @unary_operators
is_binary_operator : (token) ->
token? and token.type in @binary_operators
do_parse : (options) ->
elements = []
while @current_token()
switch @current_token().type
when "open_bracket"
elements.push(@character_class())
when "union"
@next_token("union")
else
elements.push (@[@current_token().type](@current_token()))
@next_token(@current_token().type)
elements
character_set : (token) ->
new TwitterCldr.CharacterSet(token.value.replace(/^\\p/g, "").replace(/[\{\}\[\]:]/g, ""))
negated_character_set : (token) ->
new TwitterCldr.CharacterSet(token.value.replace(/^\\[pP]/g, "").replace(/[\{\}\[\]:^]/g, ""))
unicode_char : (token) ->
new TwitterCldr.UnicodeString([parseInt(token.value.replace(/^\\u/g, "").replace(/[\{\}]/g, ""), 16)])
string : (token) ->
new TwitterCldr.UnicodeString(TwitterCldr.Utilities.unpack_string(token.value))
multichar_string : (token) ->
new TwitterCldr.UnicodeString(TwitterCldr.Utilities.unpack_string(token.value.replace(/[\{\}]/g, "")))
escaped_character : (token) ->
new TwitterCldr.Literal(token.value)
special_char : (token) ->
new TwitterCldr.Literal(token.value)
negate : (token) ->
@special_char(token)
pipe : (token) ->
@special_char(token)
ampersand : (token) ->
@special_char(token)
# current_token is already a CharacterRange object
character_range : (token) ->
token
character_class : ->
operator_stack = []
operand_stack = []
open_count = 0
while true
if @current_token().type in TwitterCldr.CharacterClass.closing_types()
last_operator = @peek(operator_stack)
open_count -= 1
while last_operator.type isnt TwitterCldr.CharacterClass.opening_type_for(@current_token().type)
operator = operator_stack.pop()
node = if @is_unary_operator(operator)
@unary_operator_node(operator.type, operand_stack.pop())
else
@binary_operator_node(operator.type, operand_stack.pop(), operand_stack.pop())
operand_stack.push(node)
last_operator = @peek(operator_stack)
operator_stack.pop()
else if @current_token().type in TwitterCldr.CharacterClass.opening_types()
open_count += 1
operator_stack.push(@current_token())
else if @current_token().type in @unary_operators.concat(@binary_operators)
operator_stack.push(@current_token())
else
operand_stack.push(@[@current_token().type](@current_token()))
@next_token(@current_token().type)
break if operator_stack.length is 0 and open_count is 0
new TwitterCldr.CharacterClass(operand_stack.pop())
peek : (array) ->
array[array.length-1]
binary_operator_node : (operator, right, left) ->
new TwitterCldr.CharacterClass.BinaryOperator(operator, left, right)
unary_operator_node : (operator, child) ->
new TwitterCldr.CharacterClass.UnaryOperator(operator, child)