Enable recognition of most kinds of characters in URL paths (#4941)
This commit is contained in:
parent
b39d512ade
commit
3816943e6b
5 changed files with 96 additions and 5 deletions
|
@ -131,7 +131,7 @@ class Formatter
|
|||
end
|
||||
|
||||
def link_html(url)
|
||||
url = Addressable::URI.parse(url).display_uri.to_s
|
||||
url = Addressable::URI.parse(url).to_s
|
||||
prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
|
||||
text = url[prefix.length, 30]
|
||||
suffix = url[prefix.length + 30..-1]
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class FetchLinkCardService < BaseService
|
||||
include ActionView::Helpers::TagHelper
|
||||
|
||||
URL_PATTERN = %r{https?://\S+}
|
||||
URL_PATTERN = %r{
|
||||
( # $1 URL
|
||||
(https?:\/\/)? # $2 Protocol (optional)
|
||||
(#{Twitter::Regex[:valid_domain]}) # $3 Domain(s)
|
||||
(?::(#{Twitter::Regex[:valid_port_number]}))? # $4 Port number (optional)
|
||||
(/#{Twitter::Regex[:valid_url_path]}*)? # $5 URL Path and anchor
|
||||
(\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? # $6 Query String
|
||||
)
|
||||
}iox
|
||||
|
||||
def call(status)
|
||||
@status = status
|
||||
|
@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService
|
|||
|
||||
def parse_urls
|
||||
if @status.local?
|
||||
urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize }
|
||||
urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
|
||||
else
|
||||
html = Nokogiri::HTML(@status.text)
|
||||
links = html.css('a')
|
||||
|
|
42
config/initializers/twitter_regex.rb
Normal file
42
config/initializers/twitter_regex.rb
Normal file
|
@ -0,0 +1,42 @@
|
|||
module Twitter
|
||||
class Regex
|
||||
|
||||
REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
|
||||
# Characters allowed to end a URL path: anything except whitespace, parentheses,
# "?" and the listed trailing punctuation, or a balanced-parentheses group.
# NOTE(review): the interpolation below is evaluated when this line runs, i.e.
# before REGEXEN[:valid_url_balanced_parens] is reassigned further down in this
# file, so it embeds whatever definition REGEXEN already held at that point
# (presumably the twitter-text gem's original -- confirm) rather than the
# override defined below. Verify whether that ordering is intentional.
REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
|
||||
REGEXEN[:valid_url_balanced_parens] = /
|
||||
\(
|
||||
(?:
|
||||
#{REGEXEN[:valid_general_url_path_chars]}+
|
||||
|
|
||||
# allow one nested level of balanced parentheses
|
||||
(?:
|
||||
#{REGEXEN[:valid_general_url_path_chars]}*
|
||||
\(
|
||||
#{REGEXEN[:valid_general_url_path_chars]}+
|
||||
\)
|
||||
#{REGEXEN[:valid_general_url_path_chars]}*
|
||||
)
|
||||
)
|
||||
\)
|
||||
/iox
|
||||
REGEXEN[:valid_url_path] = /(?:
|
||||
(?:
|
||||
#{REGEXEN[:valid_general_url_path_chars]}*
|
||||
(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
|
||||
#{REGEXEN[:valid_url_path_ending_chars]}
|
||||
)|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
|
||||
)/iox
|
||||
REGEXEN[:valid_url] = %r{
|
||||
( # $1 total match
|
||||
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
|
||||
( # $3 URL
|
||||
(https?:\/\/)? # $4 Protocol (optional)
|
||||
(#{REGEXEN[:valid_domain]}) # $5 Domain(s)
|
||||
(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
|
||||
(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
|
||||
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
|
||||
)
|
||||
)
|
||||
}iox
|
||||
end
|
||||
end
|
|
@ -89,6 +89,38 @@ RSpec.describe Formatter do
|
|||
end
|
||||
end
|
||||
|
||||
context 'matches a URL with Japanese path string' do
|
||||
let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
|
||||
|
||||
it 'has valid URL' do
|
||||
is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"'
|
||||
end
|
||||
end
|
||||
|
||||
context 'matches a URL with Korean path string' do
|
||||
let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' }
|
||||
|
||||
it 'has valid URL' do
|
||||
is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"'
|
||||
end
|
||||
end
|
||||
|
||||
context 'matches a URL with Simplified Chinese path string' do
|
||||
let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
|
||||
|
||||
it 'has valid URL' do
|
||||
is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"'
|
||||
end
|
||||
end
|
||||
|
||||
context 'matches a URL with Traditional Chinese path string' do
|
||||
let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' }
|
||||
|
||||
it 'has valid URL' do
|
||||
is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"'
|
||||
end
|
||||
end
|
||||
|
||||
context 'contains HTML (script tag)' do
|
||||
let(:text) { '<script>alert("Hello")</script>' }
|
||||
|
||||
|
|
|
@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
|
|||
stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
|
||||
stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
|
||||
stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
|
||||
stub_request(:head, 'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
|
||||
stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt'))
|
||||
stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
|
||||
|
||||
subject.call(status)
|
||||
|
@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
|
|||
expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
|
||||
end
|
||||
end
|
||||
|
||||
context do
|
||||
let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }
|
||||
|
||||
it 'works with Japanese path string' do
|
||||
expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once
|
||||
expect(status.preview_cards.first.title).to eq("SJISのページ")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'in a remote status' do
|
||||
|
|
Loading…
Reference in a new issue