Fix inefficiencies in auto-linking code (#16506)
The auto-linking code basically rewrote the whole string, escaping non-ASCII characters in an inefficient way and building a full character offset map between the unescaped and escaped texts, before sending the contents to TwitterText's extractor. Instead of doing that, this commit changes the TwitterText regexps to include valid IRI characters in addition to valid URI characters.
This commit is contained in:
parent
3dcf3f2a3a
commit
211d5c3c30
2 changed files with 5 additions and 30 deletions
|
@@ -214,39 +214,10 @@ class Formatter
|
||||||
result.flatten.join
|
result.flatten.join
|
||||||
end
|
end
|
||||||
|
|
||||||
UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
|
|
||||||
|
|
||||||
def utf8_friendly_extractor(text, options = {})
|
def utf8_friendly_extractor(text, options = {})
|
||||||
old_to_new_index = [0]
|
|
||||||
|
|
||||||
escaped = text.chars.map do |c|
|
|
||||||
output = begin
|
|
||||||
if c.ord.to_s(16).length > 2 && !UNICODE_ESCAPE_BLACKLIST_RE.match?(c)
|
|
||||||
CGI.escape(c)
|
|
||||||
else
|
|
||||||
c
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
old_to_new_index << old_to_new_index.last + output.length
|
|
||||||
|
|
||||||
output
|
|
||||||
end.join
|
|
||||||
|
|
||||||
# Note: I couldn't obtain list_slug with @user/list-name format
|
# Note: I couldn't obtain list_slug with @user/list-name format
|
||||||
# for mention so this requires additional check
|
# for mention so this requires additional check
|
||||||
special = Extractor.extract_urls_with_indices(escaped, options).map do |extract|
|
special = Extractor.extract_urls_with_indices(text, options)
|
||||||
new_indices = [
|
|
||||||
old_to_new_index.find_index(extract[:indices].first),
|
|
||||||
old_to_new_index.find_index(extract[:indices].last),
|
|
||||||
]
|
|
||||||
|
|
||||||
next extract.merge(
|
|
||||||
indices: new_indices,
|
|
||||||
url: text[new_indices.first..new_indices.last - 1]
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
standard = Extractor.extract_entities_with_indices(text, options)
|
standard = Extractor.extract_entities_with_indices(text, options)
|
||||||
extra = Extractor.extract_extra_uris_with_indices(text, options)
|
extra = Extractor.extract_extra_uris_with_indices(text, options)
|
||||||
|
|
||||||
|
|
|
@@ -24,6 +24,10 @@ module Twitter::TwitterText
|
||||||
)
|
)
|
||||||
\)
|
\)
|
||||||
/iox
|
/iox
|
||||||
|
# Character class matching one non-ASCII code point that is allowed in an IRI.
# NOTE(review): the ranges appear to mirror the `ucschar` production of
# RFC 3987 (U+00A0..U+D7FF, U+F900..U+FDCF, U+FDF0..U+FFEF and planes 1-14
# minus their last two code points) — confirm against the RFC before editing.
REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
|
||||||
|
# Character class matching one private-use code point
# (U+E000..U+F8FF, U+F0000..U+FFFFD, U+100000..U+10FFFD).
# NOTE(review): the ranges appear to mirror the `iprivate` production of
# RFC 3987 — confirm against the RFC before editing.
REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
|
||||||
|
# Matches a single character accepted inside a URL query string: either an
# IRI character (REGEXEN[:valid_iri_ucschar] or REGEXEN[:valid_iri_iprivate])
# or one of the listed ASCII letters, digits and punctuation marks.
# NOTE(review): presumably replaces the ASCII-only definition shipped with
# twitter-text so that unescaped non-ASCII URLs are extracted — verify.
REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
|
||||||
|
# Matches a single character allowed as the last character of a URL query
# string: either an IRI character (REGEXEN[:valid_iri_ucschar] or
# REGEXEN[:valid_iri_iprivate]) or an ASCII letter, digit, `_`, `&`, `=`,
# `#`, `/` or `-`. The ASCII set is narrower than in
# REGEXEN[:valid_url_query_chars]: most punctuation is left out.
REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
|
||||||
REGEXEN[:valid_url_path] = /(?:
|
REGEXEN[:valid_url_path] = /(?:
|
||||||
(?:
|
(?:
|
||||||
#{REGEXEN[:valid_general_url_path_chars]}*
|
#{REGEXEN[:valid_general_url_path_chars]}*
|
||||||
|
|
Loading…
Reference in a new issue