objc/lib/TwitterText.m (1,155 lines of code) (raw):
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
//
// TwitterText.m
//
#import "NSURL+IFUnicodeURL.h"
#import "TwitterText.h"
#import "TwitterTextEmoji.h"
#pragma mark - Regular Expressions
//
// These regular expressions are ported from twitter-text-rb on Apr 24 2012.
//
// Fragments for building regex character classes. Each macro in this group
// expands to the *contents* of a bracket expression (regex-escaped code points
// or ranges); the surrounding "[" ... "]" is added at the point of use.
// The doubled backslashes mean the \uXXXX escapes are interpreted by the regex
// engine, not by the compiler.

// U+0009..U+000D (tab, line feed, vertical tab, form feed, carriage return).
#define TWUControlCharacters @"\\u0009-\\u000D"
#define TWUSpace @"\\u0020"
#define TWUControl85 @"\\u0085"
#define TWUNoBreakSpace @"\\u00A0"
#define TWUOghamBreakSpace @"\\u1680"
#define TWUMongolianVowelSeparator @"\\u180E"
// U+2000..U+200A.
#define TWUWhiteSpaces @"\\u2000-\\u200A"
#define TWULineSeparator @"\\u2028"
#define TWUParagraphSeparator @"\\u2029"
#define TWUNarrowNoBreakSpace @"\\u202F"
#define TWUMediumMathematicalSpace @"\\u205F"
#define TWUIdeographicSpace @"\\u3000"
// Concatenation of every space fragment above, in declaration order.
#define TWUUnicodeSpaces \
TWUControlCharacters \
TWUSpace \
TWUControl85 \
TWUNoBreakSpace \
TWUOghamBreakSpace \
TWUMongolianVowelSeparator \
TWUWhiteSpaces \
TWULineSeparator \
TWUParagraphSeparator \
TWUNarrowNoBreakSpace \
TWUMediumMathematicalSpace \
TWUIdeographicSpace
// Individual directional formatting characters; the macro suffix is the
// character's conventional abbreviation (ALM, LRM, RLM, ...).
#define TWUUnicodeALM @"\\u061C"
#define TWUUnicodeLRM @"\\u200E"
#define TWUUnicodeRLM @"\\u200F"
#define TWUUnicodeLRE @"\\u202A"
#define TWUUnicodeRLE @"\\u202B"
#define TWUUnicodePDF @"\\u202C"
#define TWUUnicodeLRO @"\\u202D"
#define TWUUnicodeRLO @"\\u202E"
#define TWUUnicodeLRI @"\\u2066"
#define TWUUnicodeRLI @"\\u2067"
#define TWUUnicodeFSI @"\\u2068"
#define TWUUnicodePDI @"\\u2069"
// Concatenation of every directional fragment above, in declaration order.
#define TWUUnicodeDirectionalCharacters \
TWUUnicodeALM \
TWUUnicodeLRM \
TWUUnicodeRLM \
TWUUnicodeLRE \
TWUUnicodeRLE \
TWUUnicodePDF \
TWUUnicodeLRO \
TWUUnicodeRLO \
TWUUnicodeLRI \
TWUUnicodeRLI \
TWUUnicodeFSI \
TWUUnicodePDI
// U+FFFE, U+FEFF and U+FFFF as class contents; the *Pattern variant wraps them
// in brackets so it can be compiled as a standalone expression.
#define TWUInvalidCharacters @"\\uFFFE\\uFEFF\\uFFFF"
#define TWUInvalidCharactersPattern @"[" TWUInvalidCharacters @"]"
// Accented Latin letters accepted in URL characters and used to reject a
// mention that is immediately followed by one of them (see TWUEndMentionMatch).
#define TWULatinAccents \
@"\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF\\u0100-\\u024F\\u0253-\\u0254\\u0256-\\u0257\\u0259\\u025b\\u0263\\u0268\\u026F\\u0272\\u0289\\u02BB\\u1E00-\\u1EFF"
//
// Hashtag
//
// ASCII punctuation written as character-class contents (no enclosing
// brackets). The first variant starts with '-' so the hyphen is taken
// literally inside a class; the other two drop '-' and then '_' as their
// names say. Parentheses, square brackets and '{' are regex-escaped.
#define TWUPunctuationChars @"-_!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~"
#define TWUPunctuationCharsWithoutHyphen @"_!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~"
#define TWUPunctuationCharsWithoutHyphenAndUnderscore @"!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~"
// A hashtag must contain at least one letter or mark.
#define TWHashtagAlpha @"[\\p{L}\\p{M}]"
// Extra characters allowed inside a hashtag, as regex-escaped class contents.
// NOTE: the \uXXXX sequences here are regex escapes (literal backslash-u in
// the NSString), so this fragment is only meaningful inside a regex pattern.
#define TWHashtagSpecialChars @"_\\u200c\\u200d\\ua67e\\u05be\\u05f3\\u05f4\\uff5e\\u301c\\u309b\\u309c\\u30a0\\u30fb\\u3003\\u0f0b\\u0f0c\\u00b7"
// Any single character that may appear in a hashtag body: letter, mark,
// decimal digit or one of the special characters.
#define TWUHashtagAlphanumeric @"[\\p{L}\\p{M}\\p{Nd}" TWHashtagSpecialChars @"]"
// Class contents of characters that may NOT directly precede a hashtag.
#define TWUHashtagBoundaryInvalidChars @"&\\p{L}\\p{M}\\p{Nd}" TWHashtagSpecialChars
// What may precede the hash sign: start/end of input, a variation selector
// (U+FE0E / U+FE0F), or any character outside TWUHashtagBoundaryInvalidChars.
#define TWUHashtagBoundary \
@"^|\\ufe0e|\\ufe0f|$|[^" \
TWUHashtagBoundaryInvalidChars \
@"]"
// Full hashtag matcher; capture group 1 is the hashtag including its hash sign.
// The hash sign is either ASCII '#' or the fullwidth number sign U+FF03. The
// class previously read "[##]", a redundant duplicate that dropped the
// fullwidth form. The negative lookahead rejects '#' followed by U+FE0F or
// U+20E3 (those two are embedded as raw characters via compiler-level \u escapes).
#define TWUValidHashtag \
@"(?:" TWUHashtagBoundary @")([#\\uFF03](?!\ufe0f|\u20e3)" TWUHashtagAlphanumeric @"*" TWHashtagAlpha TWUHashtagAlphanumeric @"*)"
// Anchored check applied right after a candidate hashtag: a following hash
// sign (ASCII or fullwidth) or "://" invalidates the candidate.
#define TWUEndHashTagMatch @"\\A(?:[#\\uFF03]|://)"
//
// Symbol (Cashtag)
//
// Body of a cashtag: 1-6 letters, optionally followed by '.' or '_' and 1-2
// more letters (compiled case-insensitively by the regexp getter).
#define TWUSymbol @"[a-z]{1,6}(?:[._][a-z]{1,2})?"
// Full cashtag matcher. It must be preceded by start of input or a Unicode
// space / directional character, and followed (lookahead only) by end of
// input, whitespace or punctuation. Capture group 1 is "$" plus the symbol.
#define TWUValidSymbol \
@"(?:^|[" TWUUnicodeSpaces TWUUnicodeDirectionalCharacters @"])" \
@"(\\$" TWUSymbol @")" \
@"(?=$|\\s|[" TWUPunctuationChars @"])"
//
// Mention and list name
//
// What may precede a mention: a character that is not a letter, digit, '_',
// '!', '#', '$', '%', '&', '*' or an at sign (ASCII '@' or fullwidth U+FF20);
// or start of input; or "RT" / "RT:" that itself follows start of input or a
// character outside [a-z0-9_+~.-].
// The class previously ended in "@@", a redundant duplicate that dropped the
// fullwidth at sign.
#define TWUValidMentionPrecedingChars @"(?:[^a-z0-9_!#$%&*@\\uFF20]|^|(?:^|[^a-z0-9_+~.-])RT:?)"
// ASCII '@' or fullwidth commercial at (U+FF20). Previously "[@@]".
#define TWUAtSigns @"[@\\uFF20]"
// Whole-string validators: an at sign plus 1-20 username characters, and the
// same followed by "/" and a list slug of 1-25 characters starting with a letter.
#define TWUValidUsername @"\\A" TWUAtSigns @"[a-z0-9_]{1,20}\\z"
#define TWUValidList @"\\A" TWUAtSigns @"[a-z0-9_]{1,20}/[a-z][a-z0-9_\\-]{0,24}\\z"
// Extraction pattern. Capture groups: 1 = preceding text, 2 = at sign,
// 3 = screen name, 4 = optional "/list-slug".
#define TWUValidMentionOrList \
@"(" TWUValidMentionPrecedingChars @")" \
@"(" TWUAtSigns @")" \
@"([a-z0-9_]{1,20})" \
@"(/[a-z][a-z0-9_\\-]{0,24})?"
// Reply detection: optional leading spaces/directional characters, an at sign,
// then the screen name in capture group 1.
#define TWUValidReply @"\\A(?:[" TWUUnicodeSpaces TWUUnicodeDirectionalCharacters @"])*" TWUAtSigns @"([a-z0-9_]{1,20})"
// Anchored check applied right after a candidate mention: a following at sign,
// accented Latin letter or "://" invalidates the candidate.
#define TWUEndMentionMatch @"\\A(?:" TWUAtSigns @"|[" TWULatinAccents @"]|://)"
//
// URL
//
// What may precede a URL: any character other than an ASCII letter or digit,
// an at sign ('@' or fullwidth U+FF20), '$', a hash sign ('#' or fullwidth
// U+FF03) or one of TWUInvalidCharacters; or a directional formatting
// character; or start of input.
// The class previously listed "@@" and "##", redundant duplicates that dropped
// the fullwidth forms. The final literal now carries the same @ prefix as its
// siblings (the concatenated string is unchanged by that).
#define TWUValidURLPrecedingChars @"(?:[^a-z0-9@\\uFF20$#\\uFF03" TWUInvalidCharacters @"]|[" TWUUnicodeDirectionalCharacters @"]|^)"
// These patterns extract domains that are ascii+latin only. We separately check
// for unencoded domains with unicode characters elsewhere.
// A single character allowed in a domain label: ASCII letter, digit or an
// accented Latin letter.
#define TWUValidURLCharacters @"[a-z0-9" TWULatinAccents @"]"
// One "label." unit. The subdomain form is an atomic group and also permits
// '_' inside the label; the domain form permits only '-' inside the label.
#define TWUValidURLSubdomain @"(?>(?:" TWUValidURLCharacters @"[" TWUValidURLCharacters @"\\-_]{0,255})?" TWUValidURLCharacters @"\\.)"
#define TWUValidURLDomain @"(?:(?:" TWUValidURLCharacters @"[" TWUValidURLCharacters @"\\-]{0,255})?" TWUValidURLCharacters @"\\.)"
// Used to extract domains that contain unencoded unicode.
// Any character that is not ASCII punctuation, whitespace, a separator (\p{Z})
// or in the General Punctuation block.
#define TWUValidURLUnicodeCharacters \
@"[^" \
TWUPunctuationChars \
@"\\s\\p{Z}\\p{InGeneralPunctuation}" \
@"]"
#define TWUValidURLUnicodeDomain @"(?:(?:" TWUValidURLUnicodeCharacters @"[" TWUValidURLUnicodeCharacters @"\\-]{0,255})?" TWUValidURLUnicodeCharacters @"\\.)"
#define TWUValidPunycode @"(?:xn--[-0-9a-z]+)"
// Three alternatives, tried in order:
//   1. subdomain* domain followed by a gTLD, ccTLD or punycode TLD;
//   2. only directly after "http://" or "https://" (lookbehind): either
//      domain + ccTLD, or unicode domain labels followed by a gTLD/ccTLD;
//   3. domain + ccTLD that is immediately followed by '/'.
#define TWUValidDomain \
@"(?:" \
TWUValidURLSubdomain @"*" TWUValidURLDomain \
@"(?:" TWUValidGTLD @"|" TWUValidCCTLD @"|" TWUValidPunycode @")" \
@")" \
@"|(?:(?<=https?://)" \
@"(?:" \
@"(?:" TWUValidURLDomain TWUValidCCTLD @")" \
@"|(?:" \
TWUValidURLUnicodeDomain @"{0,255}" TWUValidURLUnicodeDomain \
@"(?:" TWUValidGTLD @"|" TWUValidCCTLD @")" \
@")" \
@")" \
@")" \
@"|(?:" \
TWUValidURLDomain TWUValidCCTLD @"(?=/)" \
@")"
// One or more digits, matched possessively ("++").
#define TWUValidPortNumber @"[0-9]++"
// Characters allowed anywhere in a URL path.
#define TWUValidGeneralURLPathChars @"[a-z\\p{Cyrillic}0-9!\\*';:=+,.$/%#\\[\\]\\-\\u2013_~&|@" TWULatinAccents @"]"
// A parenthesised path segment; supports exactly one level of nested
// parentheses inside the outer pair.
#define TWUValidURLBalancedParens \
@"\\(" \
@"(?:" \
TWUValidGeneralURLPathChars @"+" \
@"|" \
@"(?:" \
TWUValidGeneralURLPathChars @"*" \
@"\\(" \
TWUValidGeneralURLPathChars @"+" \
@"\\)" \
TWUValidGeneralURLPathChars @"*" \
@")" \
@")" \
@"\\)"
// What a path may end with: a restricted character set (no trailing
// punctuation such as '.' or ',') or a balanced-parentheses segment.
#define TWUValidURLPathEndingChars @"[a-z\\p{Cyrillic}0-9=_#/+\\-" TWULatinAccents @"]|(?:" TWUValidURLBalancedParens @")"
// One path unit: general characters and balanced-paren segments ending in a
// valid ending character, or "@" + general characters + "/".
#define TWUValidPath @"(?:" \
@"(?:" \
TWUValidGeneralURLPathChars @"*" \
@"(?:" TWUValidURLBalancedParens TWUValidGeneralURLPathChars @"*)*" \
TWUValidURLPathEndingChars \
@")|(?:@" TWUValidGeneralURLPathChars @"+/)" \
@")"
// Characters allowed in a query string, and the narrower set it may end with.
#define TWUValidURLQueryChars @"[a-z0-9!?*'\\(\\);:&=+$/%#\\[\\]\\-_\\.,~|@]"
#define TWUValidURLQueryEndingChars @"[a-z0-9\\-_&=#/]"
// Complete URL extraction pattern. It opens eight capture groups, in this
// order: whole match, preceding characters, URL, protocol, domain, port,
// path, query string (see TWUValidURLGroup for the matching indexes).
// The path is "/" followed by zero or more TWUValidPath units, matched
// possessively ("*+").
#define TWUValidURLPatternString \
@"(" \
@"(" TWUValidURLPrecedingChars @")" \
@"(" \
@"(https?://)?" \
@"(" TWUValidDomain @")" \
@"(?::(" TWUValidPortNumber @"))?" \
@"(/" \
TWUValidPath @"*+" \
@")?" \
@"(\\?" TWUValidURLQueryChars @"*" \
TWUValidURLQueryEndingChars @")?" \
@")" \
@")"
// Capture-group indexes for a match of TWUValidURLPatternString, listed in the
// order the groups open in that pattern. Numbering starts at 1 because index 0
// of an NSTextCheckingResult is the overall match range.
typedef NS_ENUM(NSInteger, TWUValidURLGroup) {
// Outermost group: preceding character(s) plus the URL.
TWUValidURLGroupAll = 1,
// Character(s) matched by TWUValidURLPrecedingChars (may be empty).
TWUValidURLGroupPreceding,
// The URL itself: optional protocol, domain, optional port/path/query.
TWUValidURLGroupURL,
// "http://" or "https://" when present.
TWUValidURLGroupProtocol,
// Text matched by TWUValidDomain.
TWUValidURLGroupDomain,
// Port digits, without the leading ':'.
TWUValidURLGroupPort,
// Path, including the leading '/'.
TWUValidURLGroupPath,
// Query string, including the leading '?'.
TWUValidURLGroupQueryString
};
// Alternation of generic top-level domains (internationalised names first,
// then ASCII names). The trailing lookahead requires the TLD to be followed by
// end of input or a character outside [a-z0-9@+-], so a TLD is not accepted
// when it is merely the prefix of a longer token.
#define TWUValidGTLD \
@"(?:(?:" \
@"삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|通販|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|政府|政务|" \
@"招聘|手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|天主教|大拿|大众汽车|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|中文网|中信|世界|ポイント|" \
@"ファッション|セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|كاثوليك|عرب|شبكة|بيتك|بازار|" \
@"العليان|ارامكو|اتصالات|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|католик|дети|zuerich|zone|zippo|zip|" \
@"zero|zara|zappos|yun|youtube|you|yokohama|yoga|yodobashi|yandex|yamaxun|yahoo|yachts|xyz|xxx|xperia|" \
@"xin|xihuan|xfinity|xerox|xbox|wtf|wtc|wow|world|works|work|woodside|wolterskluwer|wme|winners|wine|" \
@"windows|win|williamhill|wiki|wien|whoswho|weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|" \
@"weather|watches|watch|warman|wanggou|wang|walter|walmart|wales|vuelos|voyage|voto|voting|vote|volvo|" \
@"volkswagen|vodka|vlaanderen|vivo|viva|vistaprint|vista|vision|visa|virgin|vip|vin|villas|viking|vig|" \
@"video|viajes|vet|versicherung|vermögensberatung|vermögensberater|verisign|ventures|vegas|vanguard|" \
@"vana|vacations|ups|uol|uno|university|unicom|uconnect|ubs|ubank|tvs|tushu|tunes|tui|tube|trv|trust|" \
@"travelersinsurance|travelers|travelchannel|travel|training|trading|trade|toys|toyota|town|tours|" \
@"total|toshiba|toray|top|tools|tokyo|today|tmall|tkmaxx|tjx|tjmaxx|tirol|tires|tips|tiffany|tienda|" \
@"tickets|tiaa|theatre|theater|thd|teva|tennis|temasek|telefonica|telecity|tel|technology|tech|team|" \
@"tdk|tci|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|systems|symantec|sydney|swiss|" \
@"swiftcover|swatch|suzuki|surgery|surf|support|supply|supplies|sucks|style|study|studio|stream|store|" \
@"storage|stockholm|stcgroup|stc|statoil|statefarm|statebank|starhub|star|staples|stada|srt|srl|" \
@"spreadbetting|spot|sport|spiegel|space|soy|sony|song|solutions|solar|sohu|software|softbank|social|" \
@"soccer|sncf|smile|smart|sling|skype|sky|skin|ski|site|singles|sina|silk|shriram|showtime|show|shouji|" \
@"shopping|shop|shoes|shiksha|shia|shell|shaw|sharp|shangrila|sfr|sexy|sex|sew|seven|ses|services|" \
@"sener|select|seek|security|secure|seat|search|scot|scor|scjohnson|science|schwarz|schule|school|" \
@"scholarships|schmidt|schaeffler|scb|sca|sbs|sbi|saxo|save|sas|sarl|sapo|sap|sanofi|sandvikcoromant|" \
@"sandvik|samsung|samsclub|salon|sale|sakura|safety|safe|saarland|ryukyu|rwe|run|ruhr|rugby|rsvp|room|" \
@"rogers|rodeo|rocks|rocher|rmit|rip|rio|ril|rightathome|ricoh|richardli|rich|rexroth|reviews|review|" \
@"restaurant|rest|republican|report|repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|" \
@"redumbrella|redstone|red|recipes|realty|realtor|realestate|read|raid|radio|racing|qvc|quest|quebec|" \
@"qpon|pwc|pub|prudential|pru|protection|property|properties|promo|progressive|prof|productions|prod|" \
@"pro|prime|press|praxi|pramerica|post|porn|politie|poker|pohl|pnc|plus|plumbing|playstation|play|" \
@"place|pizza|pioneer|pink|ping|pin|pid|pictures|pictet|pics|piaget|physio|photos|photography|photo|" \
@"phone|philips|phd|pharmacy|pfizer|pet|pccw|pay|passagens|party|parts|partners|pars|paris|panerai|" \
@"panasonic|pamperedchef|page|ovh|ott|otsuka|osaka|origins|orientexpress|organic|org|orange|oracle|" \
@"open|ooo|onyourside|online|onl|ong|one|omega|ollo|oldnavy|olayangroup|olayan|okinawa|office|off|" \
@"observer|obi|nyc|ntt|nrw|nra|nowtv|nowruz|now|norton|northwesternmutual|nokia|nissay|nissan|ninja|" \
@"nikon|nike|nico|nhk|ngo|nfl|nexus|nextdirect|next|news|newholland|new|neustar|network|netflix|" \
@"netbank|net|nec|nba|navy|natura|nationwide|name|nagoya|nadex|nab|mutuelle|mutual|museum|mtr|mtpc|mtn|" \
@"msd|movistar|movie|mov|motorcycles|moto|moscow|mortgage|mormon|mopar|montblanc|monster|money|monash|" \
@"mom|moi|moe|moda|mobily|mobile|mobi|mma|mls|mlb|mitsubishi|mit|mint|mini|mil|microsoft|miami|metlife|" \
@"merckmsd|meo|menu|men|memorial|meme|melbourne|meet|media|med|mckinsey|mcdonalds|mcd|mba|mattel|" \
@"maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|makeup|maison|maif|" \
@"madrid|macys|luxury|luxe|lupin|lundbeck|ltda|ltd|lplfinancial|lpl|love|lotto|lotte|london|lol|loft|" \
@"locus|locker|loans|loan|llp|llc|lixil|living|live|lipsy|link|linde|lincoln|limo|limited|lilly|like|" \
@"lighting|lifestyle|lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|legal|lefrak|leclerc|lease|lds|" \
@"lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|lancome|lancia|lancaster|lamer|" \
@"lamborghini|ladbrokes|lacaixa|kyoto|kuokgroup|kred|krd|kpn|kpmg|kosher|komatsu|koeln|kiwi|kitchen|" \
@"kindle|kinder|kim|kia|kfh|kerryproperties|kerrylogistics|kerryhotels|kddi|kaufen|juniper|juegos|jprs|" \
@"jpmorgan|joy|jot|joburg|jobs|jnj|jmp|jll|jlc|jio|jewelry|jetzt|jeep|jcp|jcb|java|jaguar|iwc|iveco|" \
@"itv|itau|istanbul|ist|ismaili|iselect|irish|ipiranga|investments|intuit|international|intel|int|" \
@"insure|insurance|institute|ink|ing|info|infiniti|industries|inc|immobilien|immo|imdb|imamat|ikano|" \
@"iinet|ifm|ieee|icu|ice|icbc|ibm|hyundai|hyatt|hughes|htc|hsbc|how|house|hotmail|hotels|hoteles|hot|" \
@"hosting|host|hospital|horse|honeywell|honda|homesense|homes|homegoods|homedepot|holiday|holdings|" \
@"hockey|hkt|hiv|hitachi|hisamitsu|hiphop|hgtv|hermes|here|helsinki|help|healthcare|health|hdfcbank|" \
@"hdfc|hbo|haus|hangout|hamburg|hair|guru|guitars|guide|guge|gucci|guardian|group|grocery|gripe|green|" \
@"gratis|graphics|grainger|gov|got|gop|google|goog|goodyear|goodhands|goo|golf|goldpoint|gold|godaddy|" \
@"gmx|gmo|gmbh|gmail|globo|global|gle|glass|glade|giving|gives|gifts|gift|ggee|george|genting|gent|gea|" \
@"gdn|gbiz|gay|garden|gap|games|game|gallup|gallo|gallery|gal|fyi|futbol|furniture|fund|fun|fujixerox|" \
@"fujitsu|ftr|frontier|frontdoor|frogans|frl|fresenius|free|fox|foundation|forum|forsale|forex|ford|" \
@"football|foodnetwork|food|foo|fly|flsmidth|flowers|florist|flir|flights|flickr|fitness|fit|fishing|" \
@"fish|firmdale|firestone|fire|financial|finance|final|film|fido|fidelity|fiat|ferrero|ferrari|" \
@"feedback|fedex|fast|fashion|farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|" \
@"express|exposed|expert|exchange|everbank|events|eus|eurovision|etisalat|esurance|estate|esq|erni|" \
@"ericsson|equipment|epson|epost|enterprises|engineering|engineer|energy|emerck|email|education|edu|" \
@"edeka|eco|eat|earth|dvr|dvag|durban|dupont|duns|dunlop|duck|dubai|dtv|drive|download|dot|doosan|" \
@"domains|doha|dog|dodge|doctor|docs|dnp|diy|dish|discover|discount|directory|direct|digital|diet|" \
@"diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|delivery|degree|deals|" \
@"dealer|deal|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cyou|cymru|cuisinella|csc|cruises|" \
@"cruise|crs|crown|cricket|creditunion|creditcard|credit|cpa|courses|coupons|coupon|country|corsica|" \
@"coop|cool|cookingchannel|cooking|contractors|contact|consulting|construction|condos|comsec|computer|" \
@"compare|company|community|commbank|comcast|com|cologne|college|coffee|codes|coach|clubmed|club|cloud|" \
@"clothing|clinique|clinic|click|cleaning|claims|cityeats|city|citic|citi|citadel|cisco|circle|" \
@"cipriani|church|chrysler|chrome|christmas|chloe|chintai|cheap|chat|chase|charity|channel|chanel|cfd|" \
@"cfa|cern|ceo|center|ceb|cbs|cbre|cbn|cba|catholic|catering|cat|casino|cash|caseih|case|casa|cartier|" \
@"cars|careers|career|care|cards|caravan|car|capitalone|capital|capetown|canon|cancerresearch|camp|" \
@"camera|cam|calvinklein|call|cal|cafe|cab|bzh|buzz|buy|business|builders|build|bugatti|budapest|" \
@"brussels|brother|broker|broadway|bridgestone|bradesco|box|boutique|bot|boston|bostik|bosch|boots|" \
@"booking|book|boo|bond|bom|bofa|boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|" \
@"blockbuster|blanco|blackfriday|black|biz|bio|bingo|bing|bike|bid|bible|bharti|bet|bestbuy|best|" \
@"berlin|bentley|beer|beauty|beats|bcn|bcg|bbva|bbt|bbc|bayern|bauhaus|basketball|baseball|bargains|" \
@"barefoot|barclays|barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|azure|axa|" \
@"aws|avianca|autos|auto|author|auspost|audio|audible|audi|auction|attorney|athleta|associates|asia|" \
@"asda|arte|art|arpa|army|archi|aramco|arab|aquarelle|apple|app|apartments|aol|anz|anquan|android|" \
@"analytics|amsterdam|amica|amfam|amex|americanfamily|americanexpress|alstom|alsace|ally|allstate|" \
@"allfinanz|alipay|alibaba|alfaromeo|akdn|airtel|airforce|airbus|aigo|aig|agency|agakhan|africa|afl|" \
@"afamilycompany|aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|accountant|accenture|" \
@"academy|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa|onion" \
@")(?=[^a-z0-9@+-]|$))"
// Alternation of country-code top-level domains (internationalised names
// first, then two-letter ASCII codes). Uses the same trailing lookahead as
// TWUValidGTLD: the TLD must be followed by end of input or a character
// outside [a-z0-9@+-].
#define TWUValidCCTLD \
@"(?:(?:" \
@"한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ລາວ|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|ભારત|ਭਾਰਤ|" \
@"ভাৰত|ভারত|বাংলা|भारोत|भारतम्|भारत|ڀارت|پاکستان|موريتانيا|مليسيا|مصر|قطر|فلسطين|عمان|عراق|سورية|سودان|" \
@"تونس|بھارت|بارت|ایران|امارات|المغرب|السعودية|الجزائر|البحرين|الاردن|հայ|қаз|укр|срб|рф|мон|мкд|ею|" \
@"бел|бг|ευ|ελ|zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|ug|ua|tz|tw|tv|tt|tr|tp|to|tn|" \
@"tm|tl|tk|tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|" \
@"rs|ro|re|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|" \
@"mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|" \
@"ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|" \
@"gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|do|dm|" \
@"dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|bo|bn|bm|" \
@"bl|bj|bi|bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac" \
@")(?=[^a-z0-9@+-]|$))"
// Matches a t.co short link at the start of the input; capture group 1 is the
// alphanumeric slug after "t.co/".
#define TWUValidTCOURL @"^https?://t\\.co/([a-z0-9]+)"
// Near-duplicate of TWUValidPath; the only difference is that the second
// alternative here has no leading '@'.
// NOTE(review): not referenced in the visible part of this file - confirm
// whether it is still used before relying on it.
#define TWUValidURLPath \
@"(?:" \
@"(?:" \
TWUValidGeneralURLPathChars @"*" \
@"(?:" TWUValidURLBalancedParens TWUValidGeneralURLPathChars @"*)*" TWUValidURLPathEndingChars \
@")" \
@"|" \
@"(?:" TWUValidGeneralURLPathChars @"+/)" \
@")"
#pragma mark - Constants
// This matches the maximum length of an URL allowed by Twitter's backend.
static const NSInteger kMaxURLLength = 4096;
// Longest t.co slug (the part after "t.co/") that URLsInText: accepts.
static const NSInteger kMaxTCOSlugLength = 40;
// Character budget used by the remainingCharacterCount: family of methods.
static const NSInteger kMaxTweetLengthLegacy = 140;
// Default length every URL is counted as when no explicit
// transformedURLLength is supplied.
static const NSInteger kTransformedURLLength = 23;
// NOTE(review): not referenced in the visible part of this file; presumably
// the denominator for permillage-based weighted lengths - confirm at the use site.
static const NSInteger kPermillageScaleFactor = 1000;
// The backend adds http:// for normal links and https to *.twitter.com URLs
// (it also rewrites http to https for URLs matching *.twitter.com).
// We always add https://. By making the assumption that kURLProtocolLength
// is https, the trade off is we'll disallow a http URL that is 4096 characters.
static const NSInteger kURLProtocolLength = 8; // length of @"https://"
// Block signature taking the running length, the full text, an entity and a
// substring, and returning an NSInteger.
// NOTE(review): no use is visible in this part of the file; presumably it
// returns the updated length - confirm at the call site.
typedef NSInteger (^TextUnitCounterBlock)(NSInteger currentLength, NSString* text, TwitterTextEntity *entity, NSString *substring);
@implementation TwitterText
#pragma mark - Public Methods
// Extracts every entity from `text`: URLs, hashtags, symbols (cashtags) and
// mentions/lists.
//
// Hashtags and symbols are extracted with the URL entities supplied so that
// they are not reported inside a URL. Mentions and lists are extracted
// independently and then kept only if their range does not intersect any
// entity found before them.
//
// @param text The text to scan. nil or empty yields an empty array.
// @return All entities, sorted with -compare:.
+ (NSArray<TwitterTextEntity *> *)entitiesInText:(NSString *)text
{
    if (text.length == 0) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *collected = [NSMutableArray<TwitterTextEntity *> array];

    // URLs come first because the hashtag and symbol passes need them.
    NSArray<TwitterTextEntity *> *urlEntities = [self URLsInText:text];
    [collected addObjectsFromArray:urlEntities];
    [collected addObjectsFromArray:[self hashtagsInText:text withURLEntities:urlEntities]];
    [collected addObjectsFromArray:[self symbolsInText:text withURLEntities:urlEntities]];

    // Mentions are staged in a separate array so they are only compared with
    // the URL/hashtag/symbol entities, never with each other.
    NSMutableArray<TwitterTextEntity *> *acceptedMentions = [NSMutableArray<TwitterTextEntity *> array];
    for (TwitterTextEntity *candidate in [self mentionsOrListsInText:text]) {
        NSRange candidateRange = candidate.range;
        BOOL overlapsExisting = NO;
        for (TwitterTextEntity *existing in collected) {
            overlapsExisting = (NSIntersectionRange(existing.range, candidateRange).length > 0);
            if (overlapsExisting) {
                break;
            }
        }
        if (overlapsExisting) {
            continue;
        }
        [acceptedMentions addObject:candidate];
    }
    [collected addObjectsFromArray:acceptedMentions];

    [collected sortUsingSelector:@selector(compare:)];
    return collected;
}
// Extracts URL entities from `text`.
//
// The text is scanned left to right with the valid-URL expression. A
// candidate is dropped when:
//   - it has no protocol and the text matched just before it ends with a
//     character from invalidURLWithoutProtocolPrecedingCharSet;
//   - it is a t.co link whose slug is longer than kMaxTCOSlugLength;
//   - isValidHostAndLength:protocol:host: rejects it.
// A t.co link is truncated to "http(s)://t.co/<slug>", discarding any extra
// path characters that the general URL pattern consumed.
//
// @param text The text to scan. nil or empty yields an empty array.
// @return Entities of type TwitterTextEntityURL whose ranges index into
//         `text`, in order of appearance.
+ (NSArray<TwitterTextEntity *> *)URLsInText:(NSString *)text
{
    if (!text.length) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *results = [NSMutableArray<TwitterTextEntity *> array];
    // Loop-invariant; always needed at least once because text is non-empty.
    NSRegularExpression *urlRegexp = [self validURLRegexp];
    NSUInteger len = text.length;
    NSUInteger position = 0;
    NSRange allRange = NSMakeRange(0, 0);

    while (1) {
        // Every path through the loop body leaves `allRange` at the span that
        // has been consumed, so the next search resumes right after it.
        position = NSMaxRange(allRange);
        if (len <= position) {
            break;
        }

        // WithoutAnchoringBounds: "^" in the preceding-characters group must
        // only match the true start of the text, not the resume position.
        NSTextCheckingResult *urlResult = [urlRegexp firstMatchInString:text
                                                                options:NSMatchingWithoutAnchoringBounds
                                                                  range:NSMakeRange(position, len - position)];
        if (!urlResult) {
            break;
        }

        allRange = urlResult.range;
        if (urlResult.numberOfRanges < 9) {
            // Continue processing after the end of this invalid result.
            continue;
        }

        NSRange urlRange = [urlResult rangeAtIndex:TWUValidURLGroupURL];
        NSRange precedingRange = [urlResult rangeAtIndex:TWUValidURLGroupPreceding];
        NSRange protocolRange = [urlResult rangeAtIndex:TWUValidURLGroupProtocol];
        NSRange domainRange = [urlResult rangeAtIndex:TWUValidURLGroupDomain];

        NSString *protocol = (protocolRange.location != NSNotFound) ? [text substringWithRange:protocolRange] : nil;
        if (protocol.length == 0) {
            // Protocol-less URLs are stricter about what may come right before them.
            NSString *preceding = (precedingRange.location != NSNotFound) ? [text substringWithRange:precedingRange] : nil;
            NSRange suffixRange = [preceding rangeOfCharacterFromSet:[self invalidURLWithoutProtocolPrecedingCharSet]
                                                             options:NSBackwardsSearch | NSAnchoredSearch];
            if (suffixRange.location != NSNotFound) {
                continue;
            }
        }

        NSString *url = (urlRange.location != NSNotFound) ? [text substringWithRange:urlRange] : nil;
        NSString *host = (domainRange.location != NSNotFound) ? [text substringWithRange:domainRange] : nil;

        NSInteger start = urlRange.location;
        NSInteger end = NSMaxRange(urlRange);

        NSTextCheckingResult *tcoResult = url ? [[self validTCOURLRegexp] firstMatchInString:url options:0 range:NSMakeRange(0, url.length)] : nil;
        if (tcoResult && tcoResult.numberOfRanges >= 2) {
            // Both ranges below are relative to `url`, NOT to `text`.
            NSRange tcoRange = [tcoResult rangeAtIndex:0];
            NSRange tcoUrlSlugRange = [tcoResult rangeAtIndex:1];
            if (tcoRange.location == NSNotFound || tcoUrlSlugRange.location == NSNotFound) {
                continue;
            }

            // In the case of t.co URLs, don't allow additional path characters and ensure that the slug is under 40 chars.
            // Only the slug's length matters, so read it from the range instead of
            // slicing a string (the old code sliced `text` with this `url`-relative range).
            if (tcoUrlSlugRange.length > kMaxTCOSlugLength) {
                continue;
            }
            url = [url substringWithRange:tcoRange];
            end = start + url.length;
        }

        if ([self isValidHostAndLength:url.length protocol:protocol host:host]) {
            TwitterTextEntity *entity = [TwitterTextEntity entityWithType:TwitterTextEntityURL range:NSMakeRange(start, end - start)];
            [results addObject:entity];
            // Resume after the (possibly truncated) URL rather than the full match.
            allRange = entity.range;
        }
    }

    return results;
}
// Extracts hashtag entities from `text`.
//
// @param text The text to scan. nil or empty yields an empty array.
// @param checkingURLOverlap When YES, URLs are extracted first and hashtags
//        that intersect a URL are discarded; when NO, no URL check is made.
// @return The hashtag entities found.
+ (NSArray<TwitterTextEntity *> *)hashtagsInText:(NSString *)text checkingURLOverlap:(BOOL)checkingURLOverlap
{
    if (text.length == 0) {
        return @[];
    }

    NSArray<TwitterTextEntity *> *urlEntities = checkingURLOverlap ? [self URLsInText:text] : nil;
    return [self hashtagsInText:text withURLEntities:urlEntities];
}
// Extracts hashtag entities from `text`, skipping any that intersect one of
// the supplied URL entities.
//
// A candidate is also rejected when the text right after it matches the
// end-hashtag expression (another hash sign or "://").
//
// @param text The text to scan. nil or empty yields an empty array.
// @param urlEntities URL entities already found in `text`; may be nil, in
//        which case no overlap filtering happens.
// @return Entities of type TwitterTextEntityHashtag; each range covers the
//         hash sign and the tag text.
+ (NSArray<TwitterTextEntity *> *)hashtagsInText:(NSString *)text withURLEntities:(NSArray<TwitterTextEntity *> *)urlEntities
{
    if (text.length == 0) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *hashtags = [NSMutableArray<TwitterTextEntity *> array];
    NSRegularExpression *hashtagRegexp = [self validHashtagRegexp];
    const NSUInteger textLength = text.length;
    NSUInteger cursor = 0;

    for (;;) {
        NSTextCheckingResult *match = [hashtagRegexp firstMatchInString:text
                                                                options:NSMatchingWithoutAnchoringBounds
                                                                  range:NSMakeRange(cursor, textLength - cursor)];
        if (match == nil || match.numberOfRanges < 2) {
            break;
        }
        // Advance first so every `continue` below resumes after this match.
        cursor = NSMaxRange(match.range);

        NSRange tagRange = [match rangeAtIndex:1];

        BOOL insideURL = NO;
        for (TwitterTextEntity *urlEntity in urlEntities) {
            if (NSIntersectionRange(urlEntity.range, tagRange).length > 0) {
                insideURL = YES;
                break;
            }
        }
        if (insideURL) {
            continue;
        }

        // Look at what directly follows the tag, if anything does.
        NSUInteger tagEnd = NSMaxRange(tagRange);
        if (tagEnd < textLength) {
            NSRange trailing = [[self endHashtagRegexp] rangeOfFirstMatchInString:text
                                                                          options:0
                                                                            range:NSMakeRange(tagEnd, textLength - tagEnd)];
            if (trailing.location != NSNotFound) {
                continue;
            }
        }

        [hashtags addObject:[TwitterTextEntity entityWithType:TwitterTextEntityHashtag range:tagRange]];
    }

    return hashtags;
}
// Extracts symbol (cashtag) entities from `text`.
//
// @param text The text to scan. nil or empty yields an empty array.
// @param checkingURLOverlap When YES, URLs are extracted first and symbols
//        that intersect a URL are discarded; when NO, no URL check is made.
// @return The symbol entities found.
+ (NSArray<TwitterTextEntity *> *)symbolsInText:(NSString *)text checkingURLOverlap:(BOOL)checkingURLOverlap
{
    if (text.length == 0) {
        return @[];
    }

    NSArray<TwitterTextEntity *> *urlEntities = checkingURLOverlap ? [self URLsInText:text] : nil;
    return [self symbolsInText:text withURLEntities:urlEntities];
}
// Extracts symbol (cashtag) entities from `text`, skipping any that intersect
// one of the supplied URL entities.
//
// @param text The text to scan. nil or empty yields an empty array.
// @param urlEntities URL entities already found in `text`; may be nil, in
//        which case no overlap filtering happens.
// @return Entities of type TwitterTextEntitySymbol; each range covers the
//         "$" and the symbol text.
+ (NSArray<TwitterTextEntity *> *)symbolsInText:(NSString *)text withURLEntities:(NSArray<TwitterTextEntity *> *)urlEntities
{
    if (text.length == 0) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *symbols = [NSMutableArray<TwitterTextEntity *> array];
    NSRegularExpression *symbolRegexp = [self validSymbolRegexp];
    const NSUInteger textLength = text.length;
    NSUInteger cursor = 0;

    for (;;) {
        NSTextCheckingResult *match = [symbolRegexp firstMatchInString:text
                                                               options:NSMatchingWithoutAnchoringBounds
                                                                 range:NSMakeRange(cursor, textLength - cursor)];
        if (match == nil || match.numberOfRanges < 2) {
            break;
        }
        // Advance first so the `continue` below resumes after this match.
        cursor = NSMaxRange(match.range);

        NSRange symbolRange = [match rangeAtIndex:1];

        BOOL insideURL = NO;
        for (TwitterTextEntity *urlEntity in urlEntities) {
            if (NSIntersectionRange(urlEntity.range, symbolRange).length > 0) {
                insideURL = YES;
                break;
            }
        }
        if (insideURL) {
            continue;
        }

        [symbols addObject:[TwitterTextEntity entityWithType:TwitterTextEntitySymbol range:symbolRange]];
    }

    return symbols;
}
// Extracts only the plain @mentions from `text`, leaving out list references.
//
// @param text The text to scan. nil or empty yields an empty array.
// @return The entities from mentionsOrListsInText: whose type is
//         TwitterTextEntityScreenName, in the same order.
+ (NSArray<TwitterTextEntity *> *)mentionedScreenNamesInText:(NSString *)text
{
    if (text.length == 0) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *screenNames = [NSMutableArray<TwitterTextEntity *> array];
    for (TwitterTextEntity *candidate in [self mentionsOrListsInText:text]) {
        if (candidate.type != TwitterTextEntityScreenName) {
            continue;
        }
        [screenNames addObject:candidate];
    }
    return screenNames;
}
// Extracts @mention and @user/list entities from `text`.
//
// A candidate is rejected when the text right after it matches the
// end-mention expression (another at sign, an accented Latin letter or
// "://"). After a rejection the scan resumes one character further on, so the
// second name in "@username@username" is not picked up either.
//
// @param text The text to scan. nil or empty yields an empty array.
// @return Entities of type TwitterTextEntityScreenName or
//         TwitterTextEntityListName. Each range starts at the at sign and
//         ends after the screen name, or after the list slug when one is present.
+ (NSArray<TwitterTextEntity *> *)mentionsOrListsInText:(NSString *)text
{
    if (text.length == 0) {
        return @[];
    }

    NSMutableArray<TwitterTextEntity *> *mentions = [NSMutableArray<TwitterTextEntity *> array];
    NSRegularExpression *mentionRegexp = [self validMentionOrListRegexp];
    const NSUInteger textLength = text.length;
    NSUInteger cursor = 0;

    for (;;) {
        NSTextCheckingResult *match = [mentionRegexp firstMatchInString:text
                                                                options:NSMatchingWithoutAnchoringBounds
                                                                  range:NSMakeRange(cursor, textLength - cursor)];
        if (match == nil || match.numberOfRanges < 5) {
            break;
        }

        NSUInteger matchEnd = NSMaxRange(match.range);
        NSRange trailing = [[self endMentionRegexp] rangeOfFirstMatchInString:text
                                                                      options:0
                                                                        range:NSMakeRange(matchEnd, textLength - matchEnd)];
        if (trailing.location != NSNotFound) {
            // Avoid matching the second username in @username@username
            cursor = matchEnd + 1;
            continue;
        }
        cursor = matchEnd;

        NSRange atSignRange = [match rangeAtIndex:2];
        NSRange screenNameRange = [match rangeAtIndex:3];
        NSRange listSlugRange = [match rangeAtIndex:4];

        // The optional list group decides both the entity type and where it ends.
        BOOL hasListSlug = (listSlugRange.location != NSNotFound);
        NSUInteger entityEnd = hasListSlug ? NSMaxRange(listSlugRange) : NSMaxRange(screenNameRange);
        NSRange entityRange = NSMakeRange(atSignRange.location, entityEnd - atSignRange.location);

        [mentions addObject:[TwitterTextEntity entityWithType:(hasListSlug ? TwitterTextEntityListName : TwitterTextEntityScreenName)
                                                        range:entityRange]];
    }

    return mentions;
}
// Returns the screen name the text replies to, if the text starts with one.
//
// The text must begin (after optional spaces/directional characters) with an
// at sign and a screen name, and that name must not be followed by anything
// the end-mention expression matches.
//
// @param text The text to inspect. nil or empty yields nil.
// @return An entity of type TwitterTextEntityScreenName whose range covers
//         the screen name only (the at sign is excluded), or nil.
+ (TwitterTextEntity *)repliedScreenNameInText:(NSString *)text
{
    const NSUInteger textLength = text.length;
    if (textLength == 0) {
        return nil;
    }

    NSMatchingOptions anchoredOptions = NSMatchingWithoutAnchoringBounds | NSMatchingAnchored;
    NSTextCheckingResult *replyMatch = [[self validReplyRegexp] firstMatchInString:text
                                                                           options:anchoredOptions
                                                                             range:NSMakeRange(0, textLength)];
    BOOL hasScreenNameGroup = (replyMatch != nil && replyMatch.numberOfRanges >= 2);
    if (!hasScreenNameGroup) {
        return nil;
    }

    NSRange screenNameRange = [replyMatch rangeAtIndex:1];
    NSUInteger screenNameEnd = NSMaxRange(screenNameRange);
    NSRange trailing = [[self endMentionRegexp] rangeOfFirstMatchInString:text
                                                                  options:0
                                                                    range:NSMakeRange(screenNameEnd, textLength - screenNameEnd)];
    if (trailing.location == NSNotFound) {
        return [TwitterTextEntity entityWithType:TwitterTextEntityScreenName range:screenNameRange];
    }
    return nil;
}
// Returns the set of characters that may directly precede a hashtag.
//
// It is the inverse of what TWUHashtagBoundaryInvalidChars matches: letters
// and marks, decimal digits, '&', and the hashtag special characters. The set
// is built once and cached.
//
// @return A shared, immutable character set.
+ (NSCharacterSet *)validHashtagBoundaryCharacterSet
{
    static NSCharacterSet *charset;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken, ^{
        // TWHashtagSpecialChars is regex-escaped text: each "\\uXXXX" in it is a
        // literal backslash, 'u' and four hex digits in the NSString. Passing it
        // to characterSetWithCharactersInString: (as this method used to) adds
        // '\', 'u' and digits to the set and leaves out the intended code points.
        // So the same characters are written here with compiler-level \u escapes,
        // which embed the actual characters. Keep this list in sync with
        // TWHashtagSpecialChars; '&' is appended as in the boundary pattern.
        NSString *specialCharacters =
            @"_\u200c\u200d\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7&";

        // Generate equivalent character set matched by TWUHashtagBoundaryInvalidChars regex and invert
        NSMutableCharacterSet *set = [NSMutableCharacterSet letterCharacterSet];
        [set formUnionWithCharacterSet:[NSCharacterSet decimalDigitCharacterSet]];
        [set formUnionWithCharacterSet:[NSCharacterSet characterSetWithCharactersInString:specialCharacters]];
        charset = [set invertedSet];
    });
    return charset;
}
// Computes the length of `text` as a tweet, counting every URL as
// kTransformedURLLength characters.
//
// @param text The tweet text.
// @return The computed length.
+ (NSInteger)tweetLength:(NSString *)text
{
    const NSInteger defaultURLLength = kTransformedURLLength;
    return [self tweetLength:text transformedURLLength:defaultURLLength];
}
// Deprecated variant kept for backwards compatibility.
//
// @param text The tweet text.
// @param httpURLLength Ignored; http and https links have been counted
//        identically for some time.
// @param httpsURLLength The length every URL is counted as.
// @return The computed length.
+ (NSInteger)tweetLength:(NSString *)text httpURLLength:(NSInteger)httpURLLength httpsURLLength:(NSInteger)httpsURLLength
{
    // Only the https length is consulted.
    NSInteger perURLLength = httpsURLLength;
    return [self tweetLength:text transformedURLLength:perURLLength];
}
// Computes the length of `text` as a tweet.
//
// The text is first normalised to NFC. Every URL found in it is counted as
// `transformedURLLength` characters regardless of its real length, and every
// surrogate pair in the remaining text is counted as a single character.
//
// @param text The tweet text. nil or empty yields 0.
// @param transformedURLLength The length each URL is counted as.
// @return The computed length.
+ (NSInteger)tweetLength:(NSString *)text transformedURLLength:(NSInteger)transformedURLLength
{
    // Use Unicode Normalization Form Canonical Composition to calculate tweet text length
    text = [text precomposedStringWithCanonicalMapping];

    if (!text.length) {
        return 0;
    }

    // Remove URLs from text and add t.co length
    NSMutableString *string = [text mutableCopy];
    NSUInteger urlLengthOffset = 0;
    NSArray<TwitterTextEntity *> *urlEntities = [self URLsInText:text];
    // Delete back to front so that earlier ranges stay valid while removing.
    for (NSInteger i = (NSInteger)urlEntities.count - 1; i >= 0; i--) {
        TwitterTextEntity *entity = urlEntities[(NSUInteger)i];
        urlLengthOffset += transformedURLLength;
        [string deleteCharactersInRange:entity.range];
    }

    NSUInteger len = string.length;
    NSUInteger charCount = len + urlLengthOffset;

    // Adjust count for surrogate pair characters.
    // The string is read by index instead of being copied into a
    // variable-length stack array: `len` is caller-controlled, and a long
    // input could otherwise overflow the stack.
    for (NSUInteger i = 0; i + 1 < len; i++) {
        if (CFStringIsSurrogateHighCharacter([string characterAtIndex:i]) &&
            CFStringIsSurrogateLowCharacter([string characterAtIndex:i + 1])) {
            charCount--;
            i++; // skip the low half that was just paired
        }
    }

    return (NSInteger)charCount;
}
// Returns how many characters remain of the legacy tweet budget, counting
// every URL as kTransformedURLLength characters.
//
// @param text The tweet text.
// @return Remaining characters; negative when the text exceeds the budget.
+ (NSInteger)remainingCharacterCount:(NSString *)text
{
    const NSInteger defaultURLLength = kTransformedURLLength;
    return [self remainingCharacterCount:text transformedURLLength:defaultURLLength];
}
// Returns how many characters remain of the legacy tweet budget
// (kMaxTweetLengthLegacy).
//
// @param text The tweet text.
// @param transformedURLLength The length each URL is counted as.
// @return Remaining characters; negative when the text exceeds the budget.
+ (NSInteger)remainingCharacterCount:(NSString *)text transformedURLLength:(NSInteger)transformedURLLength
{
    NSInteger usedLength = [self tweetLength:text transformedURLLength:transformedURLLength];
    return kMaxTweetLengthLegacy - usedLength;
}
// Backwards-compatible variant of remainingCharacterCount: that forwards both
// URL lengths to tweetLength:httpURLLength:httpsURLLength:.
//
// @param text The tweet text.
// @param httpURLLength Forwarded unchanged.
// @param httpsURLLength Forwarded unchanged.
// @return kMaxTweetLengthLegacy minus the computed tweet length; negative
//         when the text exceeds the budget.
+ (NSInteger)remainingCharacterCount:(NSString *)text httpURLLength:(NSInteger)httpURLLength httpsURLLength:(NSInteger)httpsURLLength
{
    NSInteger usedLength = [self tweetLength:text httpURLLength:httpURLLength httpsURLLength:httpsURLLength];
    return kMaxTweetLengthLegacy - usedLength;
}
// Triggers construction of the cached regular expressions ahead of first use.
//
// Each expression getter is invoked from its own block on the low-priority
// global queue, so the call returns without waiting for any of them. The work
// is scheduled at most once per process; later calls do nothing.
+ (void)eagerlyLoadRegexps
{
    static dispatch_once_t warmUpToken;
    dispatch_once(&warmUpToken, ^{
        dispatch_queue_t lowPriorityQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_LOW, 0);

        // Schedules one getter call as an independent async work item. The
        // result is deliberately unused: calling the getter populates its cache.
        void (^scheduleLoad)(NSRegularExpression *(^)(void)) = ^(NSRegularExpression *(^load)(void)) {
            dispatch_async(lowPriorityQueue, ^{
                @autoreleasepool {
                    __unused NSRegularExpression *loaded = load();
                }
            });
        };

        scheduleLoad(^NSRegularExpression *{ return [self validHashtagRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validURLRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validGTLDRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validDomainRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self invalidCharacterRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validTCOURLRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self endHashtagRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validSymbolRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validMentionOrListRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validReplyRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self endMentionRegexp]; });
        scheduleLoad(^NSRegularExpression *{ return [self validDomainSucceedingCharRegexp]; });
    });
}
#pragma mark - Private Methods
// Returns the shared, case-insensitive expression built from TWUValidGTLD.
// It is compiled on first call and reused afterwards. Compilation errors are
// not reported (error is NULL).
+ (NSRegularExpression *)validGTLDRegexp
{
    static dispatch_once_t buildOnce;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&buildOnce, ^{
        NSRegularExpressionOptions matchOptions = NSRegularExpressionCaseInsensitive;
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidGTLD
                                                                options:matchOptions
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidURLPatternString.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validURLRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidURLPatternString
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidDomain.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validDomainRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidDomain
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared regular expression compiled from TWUInvalidCharactersPattern (the
/// character class built from TWUInvalidCharacters), with the case-insensitive
/// option set. It is built on first use inside dispatch_once and the same
/// instance is returned on every later call.
+ (NSRegularExpression *)invalidCharacterRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUInvalidCharactersPattern
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidTCOURL.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validTCOURLRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidTCOURL
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidHashtag.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validHashtagRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidHashtag
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUEndHashTagMatch.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)endHashtagRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUEndHashTagMatch
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidSymbol.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validSymbolRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidSymbol
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidMentionOrList.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validMentionOrListRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidMentionOrList
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUValidReply.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)validReplyRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUValidReply
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared, case-insensitive regular expression compiled from TWUEndMentionMatch.
/// It is built on first use inside dispatch_once and the same instance is
/// returned on every later call.
+ (NSRegularExpression *)endMentionRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUEndMentionMatch
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Shared character set holding the four characters '-', '_', '.' and '/'.
/// Going by the method name these are characters that may not precede a URL
/// written without a protocol; the call sites are outside this part of the file.
/// The set is created on first use inside dispatch_once and reused afterwards.
+ (NSCharacterSet *)invalidURLWithoutProtocolPrecedingCharSet
{
    static dispatch_once_t once;
    static NSCharacterSet *precedingCharacters;
    dispatch_once(&once, ^{
        precedingCharacters = [NSCharacterSet characterSetWithCharactersInString:@"-_./"];
    });
    return precedingCharacters;
}
/// Shared, case-insensitive regular expression, built on first use inside
/// dispatch_once and reused afterwards.
///
/// NOTE(review): this compiles TWUEndMentionMatch, i.e. exactly the same pattern
/// as +endMentionRegexp. That looks like a copy-paste slip for a
/// "domain succeeding character" pattern - confirm which pattern is intended.
+ (NSRegularExpression *)validDomainSucceedingCharRegexp
{
    static dispatch_once_t once;
    static NSRegularExpression *sharedExpression;
    dispatch_once(&once, ^{
        // error:NULL: a compilation failure is not reported.
        sharedExpression = [[NSRegularExpression alloc] initWithPattern:TWUEndMentionMatch
                                                                options:NSRegularExpressionCaseInsensitive
                                                                  error:NULL];
    });
    return sharedExpression;
}
/// Decides whether `host` can be turned into a URL and whether the URL length,
/// adjusted for host encoding and a missing protocol, stays within kMaxURLLength.
///
/// @param urlLength Length of the URL as it was found.
/// @param protocol  Protocol part of the URL; nil when the URL had none.
/// @param host      Host part of the URL; nil yields NO.
/// @return YES when the host converts to a non-empty URL string and the adjusted
///         length does not exceed kMaxURLLength.
+ (BOOL)isValidHostAndLength:(NSUInteger)urlLength protocol:(NSString *)protocol host:(NSString *)host
{
    if (host == nil) {
        return NO;
    }

    const NSInteger rawHostLength = [host length];

    NSError *conversionError;
    NSURL *hostURL = [NSURL URLWithUnicodeString:host error:&conversionError];
    if (conversionError) {
        // A DNS-length failure is final: NSURL itself would happily accept a host
        // whose labels exceed 63 characters (radar 35802213), so there is no fallback.
        if (conversionError.code == IFUnicodeURLConvertErrorInvalidDNSLength) {
            return NO;
        }
        // Any other failure may only mean the input is not valid for punycode
        // conversion (for example non-LDH characters, reported as
        // IFUnicodeURLConvertErrorSTD3NonLDH) while still being allowed per
        // RFC 1035, so give plain NSURL a chance.
        hostURL = [NSURL URLWithString:host];
    }
    if (hostURL == nil) {
        return NO;
    }

    // The URL string is the host in its encoded form, if encoding was necessary.
    NSString *encodedHost = hostURL.absoluteString;
    const NSInteger encodedHostLength = [encodedHost length];
    if (encodedHostLength == 0) {
        return NO;
    }

    // Encoding can only make the URL longer; account for the growth.
    NSUInteger adjustedLength = urlLength;
    if (encodedHostLength > rawHostLength) {
        adjustedLength += (encodedHostLength - rawHostLength);
    }

    // Because the backend always adds https:// if we're missing a protocol, add this
    // length back in when checking vs. our maximum allowed length of a URL.
    NSInteger lengthWithProtocol = adjustedLength;
    if (protocol == nil) {
        lengthWithProtocol += kURLProtocolLength;
    }
    return lengthWithProtocol <= kMaxURLLength;
}
@end
// Configuration identifiers. +[TwitterTextParser defaultParser] passes
// kTwitterTextParserConfigurationV3 to
// +[TwitterTextConfiguration configurationFromJSONResource:], which resolves the
// name as a ".json" resource; presumably the other two identifiers name bundled
// resources in the same way (verify against the bundle contents).
NSString * const kTwitterTextParserConfigurationClassic = @"v1";
NSString * const kTwitterTextParserConfigurationV2 = @"v2";
NSString * const kTwitterTextParserConfigurationV3 = @"v3";
// Parser returned by +[TwitterTextParser defaultParser]. It is created lazily there
// and replaced by +[TwitterTextParser setDefaultParserWithConfiguration:]; both
// assignments happen inside blocks submitted to +[TwitterTextParser _queue].
static TwitterTextParser *sDefaultParser;
@implementation TwitterTextParser
/// Serial queue labelled "twitterText", created once per process. +defaultParser
/// and +setDefaultParserWithConfiguration: run their accesses to the shared
/// default parser on this queue.
+ (dispatch_queue_t)_queue
{
    static dispatch_once_t once;
    static dispatch_queue_t serialQueue;
    dispatch_once(&once, ^{
        serialQueue = dispatch_queue_create("twitterText", DISPATCH_QUEUE_SERIAL);
    });
    return serialQueue;
}
/// Creates a parser that reads its weights and limits from `configuration`.
///
/// @param configuration Configuration stored as-is in the parser; it is not validated here.
- (instancetype)initWithConfiguration:(TwitterTextConfiguration *)configuration
{
    self = [super init];
    if (self == nil) {
        return nil;
    }
    _configuration = configuration;
    return self;
}
/// Returns the shared default parser, creating it on first use from the
/// kTwitterTextParserConfigurationV3 JSON resource.
///
/// All access to the shared parser happens on the serial +_queue. The value to
/// return is captured inside the synchronized block: reading the static after
/// dispatch_sync has returned would race with a concurrent
/// +setDefaultParserWithConfiguration:, which reassigns the static on that queue.
///
/// @return The parser that was current while this call held the queue.
+ (instancetype)defaultParser
{
    __block TwitterTextParser *parser = nil;
    dispatch_sync([self _queue], ^{
        @autoreleasepool {
            if (!sDefaultParser) {
                TwitterTextConfiguration *configuration = [TwitterTextConfiguration configurationFromJSONResource:kTwitterTextParserConfigurationV3];
                sDefaultParser = [[TwitterTextParser alloc] initWithConfiguration:configuration];
            }
            // Take the reference while still serialized with the setter.
            parser = sDefaultParser;
        }
    });
    return parser;
}
/// Replaces the shared default parser with a new parser built from `configuration`.
///
/// The replacement is created and installed asynchronously on the serial +_queue,
/// so this method returns before the new parser is in place.
+ (void)setDefaultParserWithConfiguration:(TwitterTextConfiguration *)configuration
{
    dispatch_queue_t parserQueue = [self _queue];
    dispatch_async(parserQueue, ^{
        @autoreleasepool {
            TwitterTextParser *replacement = [[TwitterTextParser alloc] initWithConfiguration:configuration];
            sDefaultParser = replacement;
        }
    });
}
/// Maximum weighted tweet length taken from this parser's configuration
/// (0 when the parser has no configuration, since messaging nil yields 0).
- (NSInteger)maxWeightedTweetLength
{
    TwitterTextConfiguration *activeConfiguration = _configuration;
    return activeConfiguration.maxWeightedTweetLength;
}
/**
 * Parses `text` with this parser's configuration.
 *
 * The text is first normalized with -precomposedStringWithCanonicalMapping. The
 * normalized text is then walked one composed character sequence at a time, with
 * the URLs found by +[TwitterText URLsInText:] treated as single units:
 *  - a URL weighs transformedURLLength * scale,
 *  - an emoji (when emoji parsing is enabled) weighs defaultWeight,
 *  - any other character sequence is weighed by -_tt_lengthOfWeightedChar:.
 * The text is reported invalid as soon as a character matching
 * +[TwitterText invalidCharacterRegexp] is seen or the accumulated weight exceeds
 * maxWeightedTweetLength * scale.
 *
 * The display and valid ranges in the result are expressed in indexes of the
 * original (pre-normalization) `text`.
 *
 * @param text The text to measure; nil and empty text produce a valid, zero-length result.
 * @return The weighted length (divided by scale), its permillage of the maximum
 *         weighted length, the validity flag and the display / valid ranges.
 */
- (TwitterTextParseResults *)parseTweet:(NSString *)text
{
    // Use Unicode Normalization Form Canonical Composition
    NSString *normalizedText;
    NSUInteger normalizedTextLength;
    if (text.length != 0) {
        normalizedText = [text precomposedStringWithCanonicalMapping];
        normalizedTextLength = normalizedText.length;
    } else {
        normalizedTextLength = 0;
    }

    if (normalizedTextLength == 0) {
        NSRange rangeZero = NSMakeRange(0, 0);
        return [[TwitterTextParseResults alloc] initWithWeightedLength:0 permillage:0 valid:YES displayRange:rangeZero validRange:rangeZero];
    }

    const NSRange rangeNotFound = NSMakeRange(NSNotFound, NSNotFound);

    // Build a map of ranges, assuming the original character count does not change after normalization.
    // The maps live on the heap (owned by NSMutableData) rather than in variable-length
    // stack arrays, so very long input cannot overflow the stack.
    const NSUInteger textLength = text.length;
    NSMutableData *originalRangesStorage = [NSMutableData dataWithLength:textLength * sizeof(NSRange)];
    NSRange *originalRanges = (NSRange *)originalRangesStorage.mutableBytes;
    for (NSUInteger i = 0; i < textLength; i++) {
        originalRanges[i] = rangeNotFound;
    }

    [self _tt_lengthOfText:text range:NSMakeRange(0, text.length) countingBlock:^NSInteger(NSInteger index, NSString *blockText, TwitterTextEntity *entity, NSString *substring) {
        // entity.range.length can be > 1 for emoji, decomposed characters, etc.
        for (NSInteger i = 0; i < entity.range.length; i++) {
            if (index+i < textLength) {
                originalRanges[index+i] = entity.range;
            } else {
                NSAssert(NO, @"index+i (%ld+%ld) greater than text.length (%lu) for text \"%@\"", (long)index, (long)i, (unsigned long)textLength, text); // casts will be unnecessary when TwitterText is no longer built for 32-bit targets
            }
        }
        return index + entity.range.length;
    }];

    NSMutableData *normalizedRangesStorage = [NSMutableData dataWithLength:normalizedTextLength * sizeof(NSRange)];
    NSRange *normalizedRanges = (NSRange *)normalizedRangesStorage.mutableBytes;
    for (NSUInteger i = 0; i < normalizedTextLength; i++) {
        normalizedRanges[i] = rangeNotFound;
    }

    __block NSInteger offset = 0;
    [self _tt_lengthOfText:normalizedText range:NSMakeRange(0, normalizedTextLength) countingBlock:^NSInteger(NSInteger composedCharIndex, NSString *blockText, TwitterTextEntity *entity, NSString *substring) {
        // map index of each composed char back to its pre-normalized index.
        if (composedCharIndex+offset < textLength) {
            NSRange originalRange = originalRanges[composedCharIndex+offset];
            for (NSInteger i = 0; i < entity.range.length; i++) {
                normalizedRanges[composedCharIndex+i] = originalRange;
            }
            // Composition shortened this sequence; later indexes in the original text are shifted by the difference.
            if (originalRange.length > entity.range.length) {
                offset += (originalRange.length - entity.range.length);
            }
        } else {
            NSAssert(NO, @"composedCharIndex+offset (%ld+%ld) greater than text.length (%lu) for text \"%@\"", (long)composedCharIndex, (long)offset, (unsigned long)textLength, text); // casts will be unnecessary when TwitterText is no longer built for 32-bit targets
        }
        return composedCharIndex + entity.range.length;
    }];

    NSArray<TwitterTextEntity *> *urlEntities = [TwitterText URLsInText:normalizedText];

    __block BOOL isValid = YES;
    __block NSInteger weightedLength = 0;
    __block NSInteger validStartIndex = NSNotFound, validEndIndex = NSNotFound;
    __block NSInteger displayStartIndex = NSNotFound, displayEndIndex = NSNotFound;

    // Weighs one unit (URL, emoji or character sequence) and updates the display / valid
    // boundaries. `previousLength` is the weight accumulated so far within the current
    // segment; `weightedLength` holds the weight of everything before that segment.
    TextUnitCounterBlock textUnitCountingBlock = ^NSInteger(NSInteger previousLength, NSString *blockText, TwitterTextEntity *entity, NSString *substring) {
        NSRange range = entity.range;
        NSInteger updatedLength = previousLength;
        switch (entity.type) {
            case TwitterTextEntityURL:
                updatedLength = previousLength + (self->_configuration.transformedURLLength * self->_configuration.scale);
                break;
            case TwitterTextEntityTweetEmojiChar:
                updatedLength = previousLength + self.configuration.defaultWeight;
                break;
            case TwitterTextEntityTweetChar:
                updatedLength = previousLength + [self _tt_lengthOfWeightedChar:substring];
                break;
            case TwitterTextEntityScreenName:
            case TwitterTextEntityHashtag:
            case TwitterTextEntityListName:
            case TwitterTextEntitySymbol:
                // Do nothing for these entity types.
                break;
        }

        if (validStartIndex == NSNotFound) {
            validStartIndex = range.location;
        }
        if (displayStartIndex == NSNotFound) {
            displayStartIndex = range.location;
        }
        if (range.length > 0) {
            displayEndIndex = NSMaxRange(range) - 1;
        }
        if (range.location + range.length <= blockText.length) {
            NSTextCheckingResult *invalidResult = [[TwitterText invalidCharacterRegexp] firstMatchInString:blockText options:0 range:range];
            if (invalidResult) {
                isValid = NO;
            } else if (isValid && (updatedLength + weightedLength <= self.maxWeightedTweetLength * self->_configuration.scale)) {
                validEndIndex = (range.length > 0) ? NSMaxRange(range) - 1 : range.location;
            } else {
                isValid = NO;
            }
        } else {
            NSAssert(NO, @"range (%@) outside bounds of blockText.length (%lu) for blockText \"%@\"", NSStringFromRange(range), (unsigned long)blockText.length, blockText);
            isValid = NO;
        }
        return updatedLength;
    };

    NSInteger textIndex = 0;
    for (TwitterTextEntity *urlEntity in urlEntities) {
        if (textIndex < urlEntity.range.location) {
            weightedLength += [self _tt_lengthOfText:normalizedText range:NSMakeRange(textIndex, urlEntity.range.location - textIndex) countingBlock:textUnitCountingBlock];
        }
        weightedLength += textUnitCountingBlock(0, normalizedText, urlEntity, [normalizedText substringWithRange:urlEntity.range]);
        textIndex = urlEntity.range.location + urlEntity.range.length;
    }

    // handle trailing text
    weightedLength += [self _tt_lengthOfText:normalizedText range:NSMakeRange(textIndex, normalizedTextLength - textIndex) countingBlock:textUnitCountingBlock];

    // Replace boundaries that were never recorded BEFORE they are used as array indexes.
    // For example validEndIndex stays NSNotFound when the very first character is invalid,
    // and indexing the map with NSNotFound would read far outside the buffer.
    if (displayStartIndex == NSNotFound) {
        displayStartIndex = 0;
    }
    if (displayEndIndex == NSNotFound) {
        displayEndIndex = 0;
    }
    if (validStartIndex == NSNotFound) {
        validStartIndex = 0;
    }
    if (validEndIndex == NSNotFound) {
        validEndIndex = 0;
    }

    NSAssert(!NSEqualRanges(normalizedRanges[displayStartIndex], rangeNotFound), @"displayStartIndex should map to existing index in original string");
    NSAssert(!NSEqualRanges(normalizedRanges[displayEndIndex], rangeNotFound), @"displayEndIndex should map to existing index in original string");
    NSAssert(!NSEqualRanges(normalizedRanges[validStartIndex], rangeNotFound), @"validStartIndex should map to existing index in original string");
    NSAssert(!NSEqualRanges(normalizedRanges[validEndIndex], rangeNotFound), @"validEndIndex should map to existing index in original string");

    // Translate the boundaries found in the normalized text back to ranges of the original text.
    NSRange displayRange = NSMakeRange(normalizedRanges[displayStartIndex].location, NSMaxRange(normalizedRanges[displayEndIndex]) - normalizedRanges[displayStartIndex].location);
    NSRange validRange = NSMakeRange(normalizedRanges[validStartIndex].location, NSMaxRange(normalizedRanges[validEndIndex]) - normalizedRanges[validStartIndex].location);

    NSInteger scaledWeightedLength = weightedLength / _configuration.scale;
    NSInteger permillage = (NSInteger)(kPermillageScaleFactor * (scaledWeightedLength / (float)[self maxWeightedTweetLength]));
    return [[TwitterTextParseResults alloc] initWithWeightedLength:scaledWeightedLength permillage:permillage valid:isValid displayRange:displayRange validRange:validRange];
}
#pragma mark -- Private methods
/**
 * Walks `range` of `text` one composed character sequence at a time and lets
 * `countingBlock` accumulate a length.
 *
 * For every sequence the block receives the length accumulated so far, `text`, an
 * entity describing the sequence (type TwitterTextEntityTweetEmojiChar when emoji
 * parsing is enabled and the sequence's range is exactly one of the emoji matches
 * in `text`, otherwise TwitterTextEntityTweetChar) and the sequence itself; the
 * value it returns becomes the new accumulated length.
 *
 * @param text          The text to enumerate.
 * @param range         The part of `text` to enumerate.
 * @param countingBlock Block invoked per composed character sequence.
 * @return The final accumulated length (0 when nothing is enumerated). When `range`
 *         does not fit inside `text`, an assertion fires and text.length is returned.
 */
- (NSInteger)_tt_lengthOfText:(NSString *)text range:(NSRange)range countingBlock:(nonnull TextUnitCounterBlock)countingBlock
{
    if (range.location + range.length > text.length) {
        NSAssert(NO, @"range (%@) outside bounds of text.length (%lu) for text \"%@\"", NSStringFromRange(range), (unsigned long)text.length, text);
        return text.length;
    }

    const BOOL emojiParsingEnabled = self.configuration.isEmojiParsingEnabled;
    NSMutableSet<NSValue *> *emojiRanges = nil;
    if (emojiParsingEnabled) {
        // With emoji parsing enabled, we first find all emoji in the input text (so that we only
        // have to match vs. the complex emoji regex once). The ranges are kept in a set so the
        // per-character membership test below does not scan a list.
        NSArray<NSTextCheckingResult *> *emojiMatches = [TwitterTextEmojiRegex() matchesInString:text options:0 range:NSMakeRange(0, text.length)];
        emojiRanges = [NSMutableSet setWithCapacity:emojiMatches.count];
        for (NSTextCheckingResult *match in emojiMatches) {
            [emojiRanges addObject:[NSValue valueWithRange:match.range]];
        }
    }

    __block NSInteger length = 0;

    // TODO: drop-iOS-10: when dropping support for iOS 10, remove the #if, #endif and everything in between
#if __IPHONE_11_0 > __IPHONE_OS_VERSION_MIN_REQUIRED
#if 0
    // Unicode 10.0 isn't fully supported on iOS 10.
    // e.g. on iOS 10, closure block arg of [NSString enumerateSubstringsInRange:options:usingBlock:]
    // is called an "incorrect" number of times for some Unicode10 composed character sequences
    // i.e. calling enumerateSubstringsInRange:options:usingBlock: on the string
    @"🤪; 🧕; 🧕🏾; 🏴"
    // results in the following values of `substringRange` and `substring` within the block
    // and of the __block var `length` once the block is complete:
    // iOS 11 and above
    substringRange = @"{1, 2}" , substring = @"🤪"
    substringRange = @"{3, 1}" , substring = @";"
    substringRange = @"{4, 1}" , substring = @" "
    substringRange = @"{5, 2}" , substring = @"🧕"
    substringRange = @"{7, 1}" , substring = @";"
    substringRange = @"{8, 1}" , substring = @" "
    substringRange = @"{9, 4}" , substring = @"🧕🏾"
    substringRange = @"{13, 1}" , substring = @";"
    substringRange = @"{14, 1}" , substring = @" "
    substringRange = @"{15, 14}" , substring = @"🏴"
    length = 15
    // iOS 10
    substringRange = @"{1, 2}" , substring = @"🤪"
    substringRange = @"{3, 1}" , substring = @";"
    substringRange = @"{4, 1}" , substring = @" "
    substringRange = @"{5, 2}" , substring = @"🧕"
    substringRange = @"{7, 1}" , substring = @";"
    substringRange = @"{8, 1}" , substring = @" "
    substringRange = @"{9, 2}" , substring = @"🧕"
    substringRange = @"{11, 2}" , substring = @"🏾"
    substringRange = @"{13, 1}" , substring = @";"
    substringRange = @"{14, 1}" , substring = @" "
    substringRange = @"{15, 2}" , substring = @"🏴"
    substringRange = @"{17, 2}" , substring = @""
    substringRange = @"{19, 2}" , substring = @""
    substringRange = @"{21, 2}" , substring = @""
    substringRange = @"{23, 2}" , substring = @""
    substringRange = @"{25, 2}" , substring = @""
    substringRange = @"{27, 2}" , substring = @""
    length = 29
#endif // #if 0
#endif // #if __IPHONE_11_0 > __IPHONE_OS_VERSION_MIN_REQUIRED

    // Without a counting block nothing can be accumulated, so there is nothing to enumerate.
    if (countingBlock != NULL) {
        [text enumerateSubstringsInRange:range options:NSStringEnumerationByComposedCharacterSequences usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
            const BOOL isEmoji = emojiParsingEnabled && [emojiRanges containsObject:[NSValue valueWithRange:substringRange]];
            TwitterTextEntityType type = isEmoji ? TwitterTextEntityTweetEmojiChar : TwitterTextEntityTweetChar;
            length = countingBlock(length, text, [TwitterTextEntity entityWithType:type range:substringRange], substring);
        }];
    }
    return length;
}
/**
 * Sums the configured weight of every code point in `text`.
 *
 * A high surrogate immediately followed by a low surrogate is combined into one
 * code point; every other UTF-16 unit (including an unpaired surrogate) counts as
 * a code point of its own. A code point takes the weight of the first configured
 * range that contains it (both range ends inclusive) and the configuration's
 * defaultWeight when no range matches.
 *
 * @param text The characters to weigh; nil or empty text weighs 0.
 * @return The total weight, not yet divided by the configuration's scale.
 */
- (NSInteger)_tt_lengthOfWeightedChar:(NSString *)text
{
    const NSUInteger length = text.length;
    if (length == 0) {
        return 0;
    }

    NSInteger weightedLength = 0;
    for (NSUInteger i = 0; i < length; i++) {
        // Decode the code point once, before it is compared against each range.
        const unichar unit = [text characterAtIndex:i];
        NSInteger codePoint = unit;
        BOOL isSurrogatePair = NO;
        if (i + 1 < length && CFStringIsSurrogateHighCharacter(unit)) {
            const unichar nextUnit = [text characterAtIndex:i + 1];
            if (CFStringIsSurrogateLowCharacter(nextUnit)) {
                codePoint = (NSInteger)CFStringGetLongCharacterForSurrogatePair(unit, nextUnit);
                isSurrogatePair = YES;
            }
        }

        NSInteger charWeight = _configuration.defaultWeight;
        for (TwitterTextWeightedRange *weightedRange in _configuration.ranges) {
            const NSInteger begin = weightedRange.range.location;
            const NSInteger end = weightedRange.range.location + weightedRange.range.length;
            if (codePoint >= begin && codePoint <= end) {
                charWeight = weightedRange.weight;
                break;
            }
        }

        // skip the next char of the surrogate pair.
        if (isSurrogatePair) {
            i++;
        }
        weightedLength += charWeight;
    }
    return weightedLength;
}
@end
@implementation TwitterTextWeightedRange

/// Creates a value object pairing a range with a weight.
///
/// @param range  Stored unchanged in `_range`.
/// @param weight Stored unchanged in `_weight`.
- (instancetype)initWithRange:(NSRange)range weight:(NSInteger)weight
{
    if ((self = [super init])) {
        _weight = weight;
        _range = range;
    }
    return self;
}

@end
@implementation TwitterTextParseResults

/// Creates a result object from the values computed by a parse.
///
/// @param length       Stored as the weighted length.
/// @param permillage   Stored as the permillage.
/// @param valid        Stored as the validity flag.
/// @param displayRange Stored as the display text range.
/// @param validRange   Stored as the valid display text range.
- (instancetype)initWithWeightedLength:(NSInteger)length permillage:(NSInteger)permillage valid:(BOOL)valid displayRange:(NSRange)displayRange validRange:(NSRange)validRange
{
    self = [super init];
    if (self == nil) {
        return nil;
    }
    _weightedLength = length;
    _permillage = permillage;
    _isValid = valid;
    _displayTextRange = displayRange;
    _validDisplayTextRange = validRange;
    return self;
}

/// Single-line summary of all five stored values, intended for logging and debugging.
- (NSString *)description
{
    NSString *displayRangeText = NSStringFromRange(_displayTextRange);
    NSString *validRangeText = NSStringFromRange(_validDisplayTextRange);
    // TODO: when no longer supporting 32-bit devices, remove (long) casts
    return [NSString stringWithFormat:@"weightedLength: %ld, permillage: %ld, isValid: %d, displayTextRange: %@, validDisplayTextRange: %@", (long)_weightedLength, (long)_permillage, _isValid, displayRangeText, validRangeText];
}

@end
@implementation TwitterTextConfiguration

/**
 * Creates a configuration from a JSON document.
 *
 * Keys read from the top-level object: "version", "maxWeightedTweetLength", "scale",
 * "defaultWeight", "transformedURLLength", "emojiParsingEnabled" and "ranges". Every
 * entry of "ranges" contributes one TwitterTextWeightedRange whose location is the
 * entry's "start", whose length is "end" - "start" and whose weight is "weight".
 *
 * When `jsonString` is nil, cannot be encoded, is not valid JSON, or its top-level
 * value is not an object, the configuration is still created: all numeric values
 * are 0, emoji parsing is disabled and `ranges` is empty.
 *
 * @param jsonString The JSON document.
 */
- (instancetype)initWithJSONString:(NSString *)jsonString
{
    self = [super init];
    if (self) {
        NSDictionary *jsonDictionary = nil;
        // NSJSONSerialization raises on nil data, so only parse when there is something to parse.
        NSData *jsonData = [jsonString dataUsingEncoding:NSUTF8StringEncoding];
        if (jsonData) {
            id jsonObject = [NSJSONSerialization JSONObjectWithData:jsonData options:0 error:NULL];
            // Only a JSON object can be queried by key; anything else is treated like a parse failure.
            if ([jsonObject isKindOfClass:[NSDictionary class]]) {
                jsonDictionary = jsonObject;
            }
        }

        _version = [jsonDictionary[@"version"] integerValue];
        _maxWeightedTweetLength = [jsonDictionary[@"maxWeightedTweetLength"] integerValue];
        _scale = [jsonDictionary[@"scale"] integerValue];
        _defaultWeight = [jsonDictionary[@"defaultWeight"] integerValue];
        _transformedURLLength = [jsonDictionary[@"transformedURLLength"] integerValue];
        _emojiParsingEnabled = [jsonDictionary[@"emojiParsingEnabled"] boolValue];

        NSArray *jsonRanges = jsonDictionary[@"ranges"];
        NSMutableArray *ranges = [NSMutableArray arrayWithCapacity:jsonRanges.count];
        for (NSDictionary *rangeDict in jsonRanges) {
            NSRange range;
            range.location = [rangeDict[@"start"] integerValue];
            // NOTE(review): an "end" smaller than "start" wraps around to a huge unsigned length - confirm inputs are always ordered.
            range.length = [rangeDict[@"end"] integerValue] - range.location;
            NSInteger charWeight = [rangeDict[@"weight"] integerValue];
            TwitterTextWeightedRange *charWeightObject = [[TwitterTextWeightedRange alloc] initWithRange:range weight:charWeight];
            [ranges addObject:charWeightObject];
        }
        _ranges = [ranges copy];
    }
    return self;
}

/**
 * Loads "<jsonResource>.json" from the bundle that contains this class and creates
 * a configuration from its contents.
 *
 * @param jsonResource Resource name without the ".json" extension.
 * @return The configuration, or nil when the resource cannot be found or read as UTF-8.
 */
+ (instancetype)configurationFromJSONResource:(NSString *)jsonResource
{
    NSString *sourceFile = [[NSBundle bundleForClass:self] pathForResource:jsonResource ofType:@"json"];
    if (!sourceFile) {
        return nil;
    }
    // Success is judged by the returned string, not by the error out-parameter.
    NSString *jsonString = [NSString stringWithContentsOfFile:sourceFile encoding:NSUTF8StringEncoding error:NULL];
    if (!jsonString) {
        return nil;
    }
    return [self configurationFromJSONString:jsonString];
}

/// Convenience factory around -initWithJSONString:; see that method for how
/// `jsonString` is interpreted.
+ (instancetype)configurationFromJSONString:(NSString *)jsonString
{
    return [[self alloc] initWithJSONString:jsonString];
}

@end