Use stricter boundaries for auto-link detection (#6522)

* Use stricter boundaries for auto-link detection

Currently autolinks use \W for boundary detection which creates many
situations of inserting links into places they don't belong (paths,
URLs, UUIDs, etc...)

This fixes that by replacing \W and only allowing these matches to touch
an open paren or bracket (matching what seems to be Github behavior) in
addition to whitespace and start of line. Similar for ending boundary as
well.

Fixes #6149
(and probably others)

* Update test

Replace incorrect test with a value that is a valid username, based on:

"Username should contain only alphanumeric, dash ('-'), underscore ('_')
and dot ('.') characters."

* Also allow for period at the end

Matching Github behavior

* Fix email regex to work properly with specificed boundaries

Create a specific capture group for email address and then use
FindStringSubmatchIndex to allow for non-matching patterns as
boundaries.

* Add Tests

Add tests for new behavior -- including tests for email addresses which
were absent before.
This commit is contained in:
mrsdizzie 2019-04-07 07:18:16 -04:00 committed by zeripath
parent 5422f23ed8
commit 6293736d02
3 changed files with 70 additions and 11 deletions

View file

@ -35,20 +35,20 @@ var (
// TODO: fix invalid linking issue // TODO: fix invalid linking issue
// mentionPattern matches all mentions in the form of "@user" // mentionPattern matches all mentions in the form of "@user"
mentionPattern = regexp.MustCompile(`(?:\s|^|\W)(@[0-9a-zA-Z-_\.]+)`) mentionPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(@[0-9a-zA-Z-_\.]+)(?:\s|$|\)|\])`)
// issueNumericPattern matches string that references to a numeric issue, e.g. #1287 // issueNumericPattern matches string that references to a numeric issue, e.g. #1287
issueNumericPattern = regexp.MustCompile(`(?:\s|^|\W)(#[0-9]+)\b`) issueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(#[0-9]+)(?:\s|$|\)|\]|\.(\s|$))`)
// issueAlphanumericPattern matches string that references to an alphanumeric issue, e.g. ABC-1234 // issueAlphanumericPattern matches string that references to an alphanumeric issue, e.g. ABC-1234
issueAlphanumericPattern = regexp.MustCompile(`(?:\s|^|\W)([A-Z]{1,10}-[1-9][0-9]*)\b`) issueAlphanumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([A-Z]{1,10}-[1-9][0-9]*)(?:\s|$|\)|\]|\.(\s|$))`)
// crossReferenceIssueNumericPattern matches string that references a numeric issue in a different repository // crossReferenceIssueNumericPattern matches string that references a numeric issue in a different repository
// e.g. gogits/gogs#12345 // e.g. gogits/gogs#12345
crossReferenceIssueNumericPattern = regexp.MustCompile(`(?:\s|^|\W)([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+#[0-9]+)\b`) crossReferenceIssueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+#[0-9]+)(?:\s|$|\)|\]|\.(\s|$))`)
// sha1CurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae // sha1CurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae
// Although SHA1 hashes are 40 chars long, the regex matches the hash from 7 to 40 chars in length // Although SHA1 hashes are 40 chars long, the regex matches the hash from 7 to 40 chars in length
// so that abbreviated hash links can be used as well. This matches git and github useability. // so that abbreviated hash links can be used as well. This matches git and github useability.
sha1CurrentPattern = regexp.MustCompile(`(?:\s|^|\W)([0-9a-f]{7,40})\b`) sha1CurrentPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-f]{7,40})(?:\s|$|\)|\]|\.(\s|$))`)
// shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax // shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax
shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`) shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`)
@ -63,7 +63,7 @@ var (
// well as the HTML5 spec: // well as the HTML5 spec:
// http://spec.commonmark.org/0.28/#email-address // http://spec.commonmark.org/0.28/#email-address
// https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail) // https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*") emailRegex = regexp.MustCompile("(?:\\s|^|\\(|\\[)([a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)(?:\\s|$|\\)|\\]|\\.(\\s|$))")
linkRegex, _ = xurls.StrictMatchingScheme("https?://") linkRegex, _ = xurls.StrictMatchingScheme("https?://")
) )
@ -656,12 +656,12 @@ func sha1CurrentPatternProcessor(ctx *postProcessCtx, node *html.Node) {
// emailAddressProcessor replaces raw email addresses with a mailto: link. // emailAddressProcessor replaces raw email addresses with a mailto: link.
func emailAddressProcessor(ctx *postProcessCtx, node *html.Node) { func emailAddressProcessor(ctx *postProcessCtx, node *html.Node) {
m := emailRegex.FindStringIndex(node.Data) m := emailRegex.FindStringSubmatchIndex(node.Data)
if m == nil { if m == nil {
return return
} }
mail := node.Data[m[0]:m[1]] mail := node.Data[m[2]:m[3]]
replaceContent(node, m[0], m[1], createLink("mailto:"+mail, mail)) replaceContent(node, m[2], m[3], createLink("mailto:"+mail, mail))
} }
// linkProcessor creates links for any HTTP or HTTPS URL not captured by // linkProcessor creates links for any HTTP or HTTPS URL not captured by

View file

@ -71,6 +71,7 @@ func TestRender_IssueIndexPattern(t *testing.T) {
test("test#1234") test("test#1234")
test("#1234test") test("#1234test")
test(" test #1234test") test(" test #1234test")
test("/home/gitea/#1234")
// should not render issue mention without leading space // should not render issue mention without leading space
test("test#54321 issue") test("test#54321 issue")
@ -103,9 +104,11 @@ func TestRender_IssueIndexPattern2(t *testing.T) {
test("#1234 test", "%s test", 1234) test("#1234 test", "%s test", 1234)
test("test #8 issue", "test %s issue", 8) test("test #8 issue", "test %s issue", 8)
test("test issue #1234", "test issue %s", 1234) test("test issue #1234", "test issue %s", 1234)
test("fixes issue #1234.", "fixes issue %s.", 1234)
// should render mentions in parentheses // should render mentions in parentheses / brackets
test("(#54321 issue)", "(%s issue)", 54321) test("(#54321 issue)", "(%s issue)", 54321)
test("[#54321 issue]", "[%s issue]", 54321)
test("test (#9801 extra) issue", "test (%s extra) issue", 9801) test("test (#9801 extra) issue", "test (%s extra) issue", 9801)
test("test (#1)", "test (%s)", 1) test("test (#1)", "test (%s)", 1)
@ -253,10 +256,14 @@ func TestRegExp_sha1CurrentPattern(t *testing.T) {
trueTestCases := []string{ trueTestCases := []string{
"d8a994ef243349f321568f9e36d5c3f444b99cae", "d8a994ef243349f321568f9e36d5c3f444b99cae",
"abcdefabcdefabcdefabcdefabcdefabcdefabcd", "abcdefabcdefabcdefabcdefabcdefabcdefabcd",
"(abcdefabcdefabcdefabcdefabcdefabcdefabcd)",
"[abcdefabcdefabcdefabcdefabcdefabcdefabcd]",
"abcdefabcdefabcdefabcdefabcdefabcdefabcd.",
} }
falseTestCases := []string{ falseTestCases := []string{
"test", "test",
"abcdefg", "abcdefg",
"e59ff077-2d03-4e6b-964d-63fbaea81f",
"abcdefghijklmnopqrstuvwxyzabcdefghijklmn", "abcdefghijklmnopqrstuvwxyzabcdefghijklmn",
"abcdefghijklmnopqrstuvwxyzabcdefghijklmO", "abcdefghijklmnopqrstuvwxyzabcdefghijklmO",
} }
@ -309,7 +316,9 @@ func TestRegExp_mentionPattern(t *testing.T) {
"@ANT_123", "@ANT_123",
"@xxx-DiN0-z-A..uru..s-xxx", "@xxx-DiN0-z-A..uru..s-xxx",
" @lol ", " @lol ",
" @Te/st", " @Te-st",
"(@gitea)",
"[@gitea]",
} }
falseTestCases := []string{ falseTestCases := []string{
"@ 0", "@ 0",
@ -317,6 +326,8 @@ func TestRegExp_mentionPattern(t *testing.T) {
"@", "@",
"", "",
"ABC", "ABC",
"/home/gitea/@gitea",
"\"@gitea\"",
} }
for _, testCase := range trueTestCases { for _, testCase := range trueTestCases {
@ -335,6 +346,9 @@ func TestRegExp_issueAlphanumericPattern(t *testing.T) {
"A-1", "A-1",
"RC-80", "RC-80",
"ABCDEFGHIJ-1234567890987654321234567890", "ABCDEFGHIJ-1234567890987654321234567890",
"ABC-123.",
"(ABC-123)",
"[ABC-123]",
} }
falseTestCases := []string{ falseTestCases := []string{
"RC-08", "RC-08",
@ -347,6 +361,8 @@ func TestRegExp_issueAlphanumericPattern(t *testing.T) {
"ABC", "ABC",
"GG-", "GG-",
"rm-1", "rm-1",
"/home/gitea/ABC-1234",
"MY-STRING-ABC-123",
} }
for _, testCase := range trueTestCases { for _, testCase := range trueTestCases {

View file

@ -36,6 +36,8 @@ func TestRender_Commits(t *testing.T) {
test(commit, `<p><a href="`+commit+`" rel="nofollow">b6dd6210ea</a></p>`) test(commit, `<p><a href="`+commit+`" rel="nofollow">b6dd6210ea</a></p>`)
test(tree, `<p><a href="`+tree+`" rel="nofollow">b6dd6210ea/src</a></p>`) test(tree, `<p><a href="`+tree+`" rel="nofollow">b6dd6210ea/src</a></p>`)
test("commit "+sha, `<p>commit <a href="`+commit+`" rel="nofollow">b6dd6210ea</a></p>`) test("commit "+sha, `<p>commit <a href="`+commit+`" rel="nofollow">b6dd6210ea</a></p>`)
test("/home/gitea/"+sha, "<p>/home/gitea/"+sha+"</p>")
} }
func TestRender_CrossReferences(t *testing.T) { func TestRender_CrossReferences(t *testing.T) {
@ -53,6 +55,9 @@ func TestRender_CrossReferences(t *testing.T) {
test( test(
"go-gitea/gitea#12345", "go-gitea/gitea#12345",
`<p><a href="`+util.URLJoin(AppURL, "go-gitea", "gitea", "issues", "12345")+`" rel="nofollow">go-gitea/gitea#12345</a></p>`) `<p><a href="`+util.URLJoin(AppURL, "go-gitea", "gitea", "issues", "12345")+`" rel="nofollow">go-gitea/gitea#12345</a></p>`)
test(
"/home/gitea/go-gitea/gitea#12345",
`<p>/home/gitea/go-gitea/gitea#12345</p>`)
} }
func TestMisc_IsSameDomain(t *testing.T) { func TestMisc_IsSameDomain(t *testing.T) {
@ -144,6 +149,44 @@ func TestRender_links(t *testing.T) {
`<p>www</p>`) `<p>www</p>`)
} }
func TestRender_email(t *testing.T) {
setting.AppURL = AppURL
setting.AppSubURL = AppSubURL
test := func(input, expected string) {
buffer := RenderString("a.md", input, setting.AppSubURL, nil)
assert.Equal(t, strings.TrimSpace(expected), strings.TrimSpace(string(buffer)))
}
// Text that should be turned into email link
test(
"info@gitea.com",
`<p><a href="mailto:info@gitea.com" rel="nofollow">info@gitea.com</a></p>`)
test(
"(info@gitea.com)",
`<p>(<a href="mailto:info@gitea.com" rel="nofollow">info@gitea.com</a>)</p>`)
test(
"[info@gitea.com]",
`<p>[<a href="mailto:info@gitea.com" rel="nofollow">info@gitea.com</a>]</p>`)
test(
"info@gitea.com.",
`<p><a href="mailto:info@gitea.com" rel="nofollow">info@gitea.com</a>.</p>`)
test(
"send email to info@gitea.co.uk.",
`<p>send email to <a href="mailto:info@gitea.co.uk" rel="nofollow">info@gitea.co.uk</a>.</p>`)
// Test that should *not* be turned into email links
test(
"\"info@gitea.com\"",
`<p>“info@gitea.com”</p>`)
test(
"/home/gitea/mailstore/info@gitea/com",
`<p>/home/gitea/mailstore/info@gitea/com</p>`)
test(
"git@try.gitea.io:go-gitea/gitea.git",
`<p>git@try.gitea.io:go-gitea/gitea.git</p>`)
}
func TestRender_ShortLinks(t *testing.T) { func TestRender_ShortLinks(t *testing.T) {
setting.AppURL = AppURL setting.AppURL = AppURL
setting.AppSubURL = AppSubURL setting.AppSubURL = AppSubURL