extract_urls_with_indices

in rb/lib/twitter-text/extractor.rb [205:258]


      # Extracts the URLs found in +text+ along with their start/end positions.
      #
      # +text+ is coerced with +to_s+ before it is inspected; +nil+ or +false+
      # yields an empty result. +options+ is consulted for one key only,
      # <tt>:extract_url_without_protocol</tt> (truthy in the default hash):
      # when it is falsy, matches that lack a protocol are skipped.
      #
      # Returns an Array of Hashes shaped like
      # <tt>{:url => String, :indices => [start, end]}</tt>. Positions come from
      # the +char_begin+ / +char_end+ helpers on the match data (defined outside
      # this method; presumably character rather than byte offsets).
      #
      # If a block is given, every extracted URL is additionally yielded as
      # <tt>(url, start, end)</tt> once the whole scan has finished.
      def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
        return [] unless text

        # Coerce once so the pre-check below and the scan operate on the same
        # String; previously the pre-check called #index on the raw argument.
        text = text.to_s

        # Fast exit: skip the scan when the text has no "." (or no ":" when
        # protocol-less URLs are disabled).
        required_char = options[:extract_url_without_protocol] ? "." : ":"
        return [] unless text.index(required_char)

        urls = []

        text.scan(Twitter::TwitterText::Regex[:valid_url]) do |all, before, url, protocol, domain, port, path, query|
          valid_url_match_data = Regexp.last_match

          # Group 3 of the match is the URL itself (without the preceding chars).
          start_position = valid_url_match_data.char_begin(3)
          end_position = valid_url_match_data.char_end(3)

          if protocol
            # When the URL matches the t.co pattern, keep only the matched
            # portion and recompute where it ends.
            if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
              tco_match = Regexp.last_match
              # Skip the URL entirely when the captured slug is too long.
              next if tco_match[1] && tco_match[1].length > MAX_TCO_SLUG_LENGTH
              url = tco_match[0]
              end_position = start_position + url.codepoint_length
            end

            next unless is_valid_domain(url.length, domain, protocol)

            urls << {
              :url => url,
              :indices => [start_position, end_position]
            }
          else
            # Protocol is missing: honour the option and reject matches whose
            # preceding characters disqualify a protocol-less URL.
            next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]

            # Extract every ASCII-only domain contained in the matched domain.
            last_url = nil
            domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
              next unless is_valid_domain(url.length, ascii_domain, protocol)
              ascii_match = Regexp.last_match
              # Offsets from the inner scan are relative to +domain+, so they
              # are shifted by the position where the URL starts.
              last_url = {
                :url => ascii_domain,
                :indices => [start_position + ascii_match.char_begin(0),
                             start_position + ascii_match.char_end(0)]
              }
              urls << last_url
            end

            # No ASCII-only domain was accepted: skip the entire match.
            next unless last_url

            # last_url so far holds only a domain. When a path is present,
            # widen the last entry (already in +urls+) to cover the rest of
            # the matched URL.
            if path
              last_url[:url] = url.sub(domain, last_url[:url])
              last_url[:indices][1] = end_position
            end
          end
        end

        urls.each { |entry| yield entry[:url], entry[:indices].first, entry[:indices].last } if block_given?
        urls
      end