Optimization of labels handling in issue_search (#4228)

This PR optimizes the SQL query and de-duplicate the labels' ids when generating the query string, on the issue page.

<hr/>

### Background

Some time ago, BingBot and some other crawlers have been putting my instance on its knees with requests containing a lot of label ids, like this one :

```
[07/Aug/2023:11:28:37 +0200] "GET /Dolibarr/sendrecurringinvoicebymail/issues?q=&type=all&sort=&state=closed&labels=1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c2%2c10%2c2%2c1%2c1%2c10%2c10%2c7%2c6%2c10%2c10%2c3%2c2%2c1%2c5%2c10%2c1%2c6%2c2%2c7%2c3%2c7%2c6%2c10%2c1%2c10%2c1%2c1%2c7%2c7%2c1%2c1%2c1%2c1%2c10%2c10%2c1%2c2%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c2%2c1%2c12%2c6%2c6%2c10&milestone=0&project=-1&poster=0 HTTP/1.1" 499 0 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/103.0.5060.134 Safari/537.36"
```

Since each of the label ids implies a join, it grows exponentially expensive for the database engine (at least on PostgreSQL but SQLite suffers a little too).

Thus, this PR proposes two enhancements:

* rewrite the database query to use only one squashed condition,
* deduplicate the label ids when generating the URL.

### Performance comparison

Here are some timings on Postgresql-backed, Forgejo 7.0.4 instances :
```sh
$ time curl -s -o /dev/null "http://localhost:3000/toto/tata/issues?q=&type=all&sort=&labels=19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25&state=open&milestone=0&project=0&assignee=0&poster=0"

real    0m10,491s
user    0m0,017s
sys     0m0,008s
```
...and with the patch:
```sh
$ time curl -s -o /dev/null "http://localhost:3000/toto/tata/issues?q=&type=all&sort=&labels=19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25&state=open&milestone=0&project=0&assignee=0&poster=0"

real    0m0,094s
user    0m0,012s
sys     0m0,013s
```

### Annex

This issue was originally proposed to [Gitea](https://github.com/go-gitea/gitea/pull/26460) but didn't get much attention, and I switched to Forgejo in the meantime :)

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/4228
Reviewed-by: Earl Warren <earl-warren@noreply.codeberg.org>
Co-authored-by: Chl <chl@xlii.si>
Co-committed-by: Chl <chl@xlii.si>
This commit is contained in:
Chl 2024-06-28 05:11:57 +00:00 committed by Earl Warren
parent 2121a29f89
commit 544cbc6f01
4 changed files with 80 additions and 16 deletions

View file

@ -6,6 +6,7 @@ package issues
import ( import (
"context" "context"
"fmt" "fmt"
"strconv"
"strings" "strings"
"code.gitea.io/gitea/models/db" "code.gitea.io/gitea/models/db"
@ -13,6 +14,7 @@ import (
repo_model "code.gitea.io/gitea/models/repo" repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/models/unit" "code.gitea.io/gitea/models/unit"
user_model "code.gitea.io/gitea/models/user" user_model "code.gitea.io/gitea/models/user"
"code.gitea.io/gitea/modules/container"
"code.gitea.io/gitea/modules/optional" "code.gitea.io/gitea/modules/optional"
"xorm.io/builder" "xorm.io/builder"
@ -116,14 +118,30 @@ func applyLabelsCondition(sess *xorm.Session, opts *IssuesOptions) {
if opts.LabelIDs[0] == 0 { if opts.LabelIDs[0] == 0 {
sess.Where("issue.id NOT IN (SELECT issue_id FROM issue_label)") sess.Where("issue.id NOT IN (SELECT issue_id FROM issue_label)")
} else { } else {
for i, labelID := range opts.LabelIDs { // deduplicate the label IDs for inclusion and exclusion
includedLabelIDs := make(container.Set[int64])
excludedLabelIDs := make(container.Set[int64])
for _, labelID := range opts.LabelIDs {
if labelID > 0 { if labelID > 0 {
sess.Join("INNER", fmt.Sprintf("issue_label il%d", i), includedLabelIDs.Add(labelID)
fmt.Sprintf("issue.id = il%[1]d.issue_id AND il%[1]d.label_id = %[2]d", i, labelID))
} else if labelID < 0 { // 0 is not supported here, so just ignore it } else if labelID < 0 { // 0 is not supported here, so just ignore it
sess.Where("issue.id not in (select issue_id from issue_label where label_id = ?)", -labelID) excludedLabelIDs.Add(-labelID)
} }
} }
// ... and use them in a subquery of the form :
// where (select count(*) from issue_label where issue_id=issue.id and label_id in (2, 4, 6)) = 3
// This equality is guaranteed thanks to unique index (issue_id,label_id) on table issue_label.
if len(includedLabelIDs) > 0 {
subQuery := builder.Select("count(*)").From("issue_label").Where(builder.Expr("issue_id = issue.id")).
And(builder.In("label_id", includedLabelIDs.Values()))
sess.Where(builder.Eq{strconv.Itoa(len(includedLabelIDs)): subQuery})
}
// or (select count(*)...) = 0 for excluded labels
if len(excludedLabelIDs) > 0 {
subQuery := builder.Select("count(*)").From("issue_label").Where(builder.Expr("issue_id = issue.id")).
And(builder.In("label_id", excludedLabelIDs.Values()))
sess.Where(builder.Eq{"0": subQuery})
}
} }
} }

View file

@ -191,6 +191,19 @@ func TestIssues(t *testing.T) {
}, },
[]int64{}, // issues with **both** label 1 and 2, none of these issues matches, TODO: add more tests []int64{}, // issues with **both** label 1 and 2, none of these issues matches, TODO: add more tests
}, },
{
issues_model.IssuesOptions{
LabelIDs: []int64{-1, 2},
},
[]int64{5}, // issue without label 1 but with label 2.
},
{
issues_model.IssuesOptions{
RepoCond: builder.In("repo_id", 1),
LabelIDs: []int64{0},
},
[]int64{11, 3}, // issues without any label (ordered by creation date desc.)(note: 11 is a pull request)
},
{ {
issues_model.IssuesOptions{ issues_model.IssuesOptions{
MilestoneIDs: []int64{1}, MilestoneIDs: []int64{1},

View file

@ -7,6 +7,7 @@ package issues
import ( import (
"context" "context"
"fmt" "fmt"
"slices"
"strconv" "strconv"
"strings" "strings"
@ -142,28 +143,38 @@ func (l *Label) CalOpenOrgIssues(ctx context.Context, repoID, labelID int64) {
// LoadSelectedLabelsAfterClick calculates the set of selected labels when a label is clicked // LoadSelectedLabelsAfterClick calculates the set of selected labels when a label is clicked
func (l *Label) LoadSelectedLabelsAfterClick(currentSelectedLabels []int64, currentSelectedExclusiveScopes []string) { func (l *Label) LoadSelectedLabelsAfterClick(currentSelectedLabels []int64, currentSelectedExclusiveScopes []string) {
var labelQuerySlice []string labelQuerySlice := []int64{}
labelSelected := false labelSelected := false
labelID := strconv.FormatInt(l.ID, 10) exclusiveScope := l.ExclusiveScope()
labelScope := l.ExclusiveScope() for i, curSel := range currentSelectedLabels {
for i, s := range currentSelectedLabels { if curSel == l.ID {
if s == l.ID {
labelSelected = true labelSelected = true
} else if -s == l.ID { } else if -curSel == l.ID {
labelSelected = true labelSelected = true
l.IsExcluded = true l.IsExcluded = true
} else if s != 0 { } else if curSel != 0 {
// Exclude other labels in the same scope from selection // Exclude other labels in the same scope from selection
if s < 0 || labelScope == "" || labelScope != currentSelectedExclusiveScopes[i] { if curSel < 0 || exclusiveScope == "" || exclusiveScope != currentSelectedExclusiveScopes[i] {
labelQuerySlice = append(labelQuerySlice, strconv.FormatInt(s, 10)) labelQuerySlice = append(labelQuerySlice, curSel)
} }
} }
} }
if !labelSelected { if !labelSelected {
labelQuerySlice = append(labelQuerySlice, labelID) labelQuerySlice = append(labelQuerySlice, l.ID)
} }
l.IsSelected = labelSelected l.IsSelected = labelSelected
l.QueryString = strings.Join(labelQuerySlice, ",")
// Sort and deduplicate the ids to avoid the crawlers asking for the
// same thing with simply a different order of parameters
slices.Sort(labelQuerySlice)
labelQuerySlice = slices.Compact(labelQuerySlice)
// Quick conversion (strings.Join() doesn't accept slices of Int64)
labelQuerySliceStrings := make([]string, len(labelQuerySlice))
for i, x := range labelQuerySlice {
labelQuerySliceStrings[i] = strconv.FormatInt(x, 10)
}
l.QueryString = strings.Join(labelQuerySliceStrings, ",")
} }
// BelongsToOrg returns true if label is an organization label // BelongsToOrg returns true if label is an organization label
@ -176,7 +187,7 @@ func (l *Label) BelongsToRepo() bool {
return l.RepoID > 0 return l.RepoID > 0
} }
// Return scope substring of label name, or empty string if none exists // ExclusiveScope returns scope substring of label name, or empty string if none exists
func (l *Label) ExclusiveScope() string { func (l *Label) ExclusiveScope() string {
if !l.Exclusive { if !l.Exclusive {
return "" return ""

View file

@ -23,6 +23,28 @@ func TestLabel_CalOpenIssues(t *testing.T) {
assert.EqualValues(t, 2, label.NumOpenIssues) assert.EqualValues(t, 2, label.NumOpenIssues)
} }
func TestLabel_LoadSelectedLabelsAfterClick(t *testing.T) {
assert.NoError(t, unittest.PrepareTestDatabase())
// Loading the label id:8 (scope/label2) which have a scope and an
// exclusivity with id:7 (scope/label1)
label := unittest.AssertExistsAndLoadBean(t, &issues_model.Label{ID: 8})
// First test : with negative and scope
label.LoadSelectedLabelsAfterClick([]int64{1, -8}, []string{"", "scope"})
assert.Equal(t, "1", label.QueryString)
assert.Equal(t, true, label.IsSelected)
// Second test : with duplicates
label.LoadSelectedLabelsAfterClick([]int64{1, 7, 1, 7, 7}, []string{"", "scope", "", "scope", "scope"})
assert.Equal(t, "1,8", label.QueryString)
assert.Equal(t, false, label.IsSelected)
// Third test : empty set
label.LoadSelectedLabelsAfterClick([]int64{}, []string{})
assert.False(t, label.IsSelected)
assert.Equal(t, "8", label.QueryString)
}
func TestLabel_ExclusiveScope(t *testing.T) { func TestLabel_ExclusiveScope(t *testing.T) {
assert.NoError(t, unittest.PrepareTestDatabase()) assert.NoError(t, unittest.PrepareTestDatabase())
label := unittest.AssertExistsAndLoadBean(t, &issues_model.Label{ID: 7}) label := unittest.AssertExistsAndLoadBean(t, &issues_model.Label{ID: 7})