Recognize most kinds of characters in URL paths (#4941)

This commit is contained in:
ふぁぼ原 2017-09-15 01:03:20 +09:00 committed by Eugen Rochko
parent b39d512ade
commit 3816943e6b
5 changed files with 96 additions and 5 deletions

View file

@ -131,7 +131,7 @@ class Formatter
end
def link_html(url)
url = Addressable::URI.parse(url).display_uri.to_s
url = Addressable::URI.parse(url).to_s
prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
text = url[prefix.length, 30]
suffix = url[prefix.length + 30..-1]

View file

@ -1,9 +1,15 @@
# frozen_string_literal: true
class FetchLinkCardService < BaseService
include ActionView::Helpers::TagHelper
URL_PATTERN = %r{https?://\S+}
# Pattern used to pull URLs out of status text. It is assembled from the
# Twitter::Regex pieces for domain, port number, path and query string.
# The protocol group is optional, so scheme-less text (e.g. a bare domain)
# can match as well.
# Capture group 1 is the complete URL; it is the array[0] element that
# parse_urls reads from each String#scan result.
# Flags: i = case-insensitive, o = interpolate only once, x = extended
# syntax (whitespace and "#" comments inside the pattern are ignored).
URL_PATTERN = %r{
( # $1 URL
(https?:\/\/)? # $2 Protocol (optional)
(#{Twitter::Regex[:valid_domain]}) # $3 Domain(s)
(?::(#{Twitter::Regex[:valid_port_number]}))? # $4 Port number (optional)
(/#{Twitter::Regex[:valid_url_path]}*)? # $5 URL Path and anchor
(\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? # $6 Query String
)
}iox
def call(status)
@status = status
@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService
def parse_urls
if @status.local?
urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize }
urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
else
html = Nokogiri::HTML(@status.text)
links = html.css('a')

View file

@ -0,0 +1,42 @@
# Re-opens Twitter::Regex to replace the URL-path entries of its REGEXEN
# table so that URL paths may contain almost any non-whitespace character
# (for example CJK text), then rebuilds the entries that embed them.
#
# REGEXEN, and the :valid_url_preceding_chars, :valid_domain,
# :valid_port_number, :valid_url_query_chars and
# :valid_url_query_ending_chars entries read below, are not defined here;
# they are expected to be provided by the twitter-text library that is
# loaded before this file.
#
# Ordering matters: "#{...}" inside a regex literal is expanded when the
# literal is evaluated, so every entry must be assigned before any other
# entry that interpolates it.
module Twitter
  class Regex
    # Any single character allowed inside a path: everything except
    # whitespace, parentheses and the query delimiter "?".
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou

    # A parenthesised run of path characters, optionally containing one
    # nested pair of parentheses.
    #
    # This override is assigned BEFORE :valid_url_path_ending_chars because
    # that entry interpolates it. Assigning it afterwards would bake the
    # previously registered balanced-parens pattern into the ending-chars
    # pattern, so a path ending in a parenthesised segment made of the newly
    # allowed characters would be cut short.
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox

    # What a path may END with: a path character that is not trailing
    # punctuation (so a sentence-final "." or "," is not swallowed into the
    # link), or a complete balanced-parentheses group.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou

    # One path chunk: either path characters (with optional balanced
    # parentheses groups) finished by a valid ending, or path characters
    # followed by a slash.
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox

    # Full URL matcher, rebuilt so that it embeds the path pattern above.
    REGEXEN[:valid_url] = %r{
      (                                                                                     # $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            # $2 Preceding character
        (                                                                                   # $3 URL
          (https?:\/\/)?                                                                    # $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       # $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            # $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  # $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
        )
      )
    }iox
  end
end

View file

@ -89,6 +89,38 @@ RSpec.describe Formatter do
end
end
# URLs whose path is written in non-ASCII script (Japanese, Korean,
# Simplified and Traditional Chinese). In each case the whole path must be
# recognised as part of the link, and the expected href carries the path in
# its UTF-8 percent-encoded form.
context 'matches a URL with Japanese path string' do
let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
it 'has valid URL' do
is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"'
end
end
context 'matches a URL with Korean path string' do
let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' }
it 'has valid URL' do
is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"'
end
end
context 'matches a URL with Simplified Chinese path string' do
let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
it 'has valid URL' do
is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"'
end
end
context 'matches a URL with Traditional Chinese path string' do
let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' }
it 'has valid URL' do
is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"'
end
end
context 'contains HTML (script tag)' do
let(:text) { '<script>alert("Hello")</script>' }

View file

@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
stub_request(:head, 'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt'))
stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
subject.call(status)
@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
end
end
# The status text places the URL directly after Japanese text, with no
# separating whitespace, and the URL itself has a Japanese path. The example
# expects exactly that URL to be requested and the resulting preview card to
# carry the title "SJISのページ".
context do
let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }
it 'works with Japanese path string' do
expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once
expect(status.preview_cards.first.title).to eq("SJISのページ")
end
end
end
context 'in a remote status' do