From 97a7c04a8fc4747d32af84fca3d068425ab33768 Mon Sep 17 00:00:00 2001 From: Giteabot Date: Wed, 1 May 2024 20:59:59 +0800 Subject: [PATCH] Fix bleve fuzziness (#30799) (#30804) Backport #30799 by wxiaoguang Fix #30797 Fix #30317 Co-authored-by: wxiaoguang --- modules/indexer/code/bleve/bleve.go | 4 +--- modules/indexer/internal/bleve/util.go | 12 ++++++++++++ modules/indexer/issues/bleve/bleve.go | 8 ++------ routers/web/repo/search.go | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index bd844205a6..8056b58ec2 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -39,8 +39,6 @@ import ( const ( unicodeNormalizeName = "unicodeNormalize" maxBatchSize = 16 - // fuzzyDenominator determines the levenshtein distance per each character of a keyword - fuzzyDenominator = 4 ) func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { @@ -245,7 +243,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int phraseQuery.Analyzer = repoIndexerAnalyzer keywordQuery = phraseQuery if opts.IsKeywordFuzzy { - phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator + phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) } if len(opts.RepoIDs) > 0 { diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index 43a7c3c5ec..a2265f86e6 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -47,3 +47,15 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { return index, 0, nil } + +func GuessFuzzinessByKeyword(s string) int { + // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2 + // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword + // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. + for _, r := range s { + if r >= 128 { + return 0 + } + } + return min(2, len(s)/4) +} diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index 1f54be721b..d7957b266a 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -35,11 +35,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { }) } -const ( - maxBatchSize = 16 - // fuzzyDenominator determines the levenshtein distance per each character of a keyword - fuzzyDenominator = 4 -) +const maxBatchSize = 16 // IndexerData an update to the issue indexer type IndexerData internal.IndexerData @@ -162,7 +158,7 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( if options.Keyword != "" { fuzziness := 0 if options.IsFuzzyKeyword { - fuzziness = len(options.Keyword) / fuzzyDenominator + fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword) } queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go index 23cf898630..d7854b2499 100644 --- a/routers/web/repo/search.go +++ b/routers/web/repo/search.go @@ -28,6 +28,7 @@ func Search(ctx *context.Context) { ctx.Data["Language"] = language ctx.Data["IsFuzzy"] = isFuzzy ctx.Data["PageIsViewCode"] = true + ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled if keyword == "" { ctx.HTML(http.StatusOK, tplSearch) @@ -86,7 +87,6 @@ func Search(ctx *context.Context) { } } - ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled ctx.Data["Repo"] = ctx.Repo.Repository ctx.Data["SearchResults"] = searchResults ctx.Data["SearchResultLanguages"] = searchResultLanguages