From f64fbd9b74998f3ac8353d2a8344e2e6f0ce1936 Mon Sep 17 00:00:00 2001 From: Bruno Sofiato Date: Wed, 6 Nov 2024 17:51:20 -0300 Subject: [PATCH] Updated tokenizer to better matching when search for code snippets (#32261) This PR improves the accuracy of Gitea's code search. Currently, Gitea does not consider statements such as `onsole.log("hello")` as hits when the user searches for `log`. The culprit is how both ES and Bleve are tokenizing the file contents (in both cases, `console.log` is a whole token). In ES' case, we changed the tokenizer to [simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.). In such a case, tokens are words formed by digits and letters. In Bleve's case, it employs a [letter](https://blevesearch.com/docs/Tokenizers/) tokenizer. Resolves #32220 --------- Signed-off-by: Bruno Sofiato --- modules/indexer/code/bleve/bleve.go | 5 +- .../code/elasticsearch/elasticsearch.go | 13 ++++- modules/indexer/code/indexer_test.go | 49 ++++++++++++++++++ modules/indexer/internal/bleve/util.go | 9 ++-- modules/indexer/internal/bleve/util_test.go | 8 +++ .../org42/search-by-path.git/description | 5 +- .../org42/search-by-path.git/info/refs | 2 +- .../objects/info/commit-graph | Bin 1772 -> 0 bytes .../search-by-path.git/objects/info/packs | 2 +- ...29256bc27cb2ec73898507df710be7a3cf5.bitmap | Bin 674 -> 0 bytes ...3dc29256bc27cb2ec73898507df710be7a3cf5.idx | Bin 2080 -> 0 bytes ...3dc29256bc27cb2ec73898507df710be7a3cf5.rev | Bin 196 -> 0 bytes ...76cf6e2b46bc816936ab69306fb10aea571.bitmap | Bin 0 -> 678 bytes ...bef76cf6e2b46bc816936ab69306fb10aea571.idx | Bin 0 -> 2108 bytes ...f76cf6e2b46bc816936ab69306fb10aea571.pack} | Bin 6714 -> 6545 bytes ...bef76cf6e2b46bc816936ab69306fb10aea571.rev | Bin 0 -> 200 bytes .../org42/search-by-path.git/packed-refs | 2 +- 17 files changed, 83 insertions(+), 12 deletions(-) delete mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph delete mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap delete mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx delete mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx rename tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/{pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack => pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack} (74%) create mode 100644 tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.rev diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 90e5e62bcb..772317fa59 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -31,6 +31,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/letter" "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search/query" @@ -69,7 +70,7 @@ const ( filenameIndexerAnalyzer = "filenameIndexerAnalyzer" filenameIndexerTokenizer = "filenameIndexerTokenizer" repoIndexerDocType = "repoIndexerDocType" - repoIndexerLatestVersion = 7 + repoIndexerLatestVersion = 8 ) // generateBleveIndexMapping generates a bleve index mapping for the repo indexer @@ -105,7 +106,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ "type": analyzer_custom.Name, "char_filters": []string{}, - "tokenizer": unicode.Name, + "tokenizer": letter.Name, "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, }); err != nil { return nil, err diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 669a1bafcc..1c4dd39eff 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -30,7 +30,7 @@ import ( ) const ( - esRepoIndexerLatestVersion = 2 + esRepoIndexerLatestVersion = 3 // multi-match-types, currently only 2 types are used // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types esMultiMatchTypeBestFields = "best_fields" @@ -60,6 +60,10 @@ const ( "settings": { "analysis": { "analyzer": { + "content_analyzer": { + "tokenizer": "content_tokenizer", + "filter" : ["lowercase"] + }, "filename_path_analyzer": { "tokenizer": "path_tokenizer" }, @@ -68,6 +72,10 @@ const ( } }, "tokenizer": { + "content_tokenizer": { + "type": "simple_pattern_split", + "pattern": "[^a-zA-Z0-9]" + }, "path_tokenizer": { "type": "path_hierarchy", "delimiter": "/" @@ -104,7 +112,8 @@ const ( "content": { "type": "text", "term_vector": "with_positions_offsets", - "index": true + "index": true, + "analyzer": "content_analyzer" }, "commit_id": { "type": "keyword", diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 5b33528dcd..020ccc72f8 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -181,6 +181,55 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for matches on the contents of files regardless of case. + { + RepoIDs: nil, + Keyword: "dESCRIPTION", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "README.md", + Content: "# repo1\n\nDescription for repo1", + }, + }, + }, + // Search for an exact match on the filename within the repo '62' (case insenstive). + // This scenario yields a single result (the file avocado.md on the repo '62') + { + RepoIDs: []int64{62}, + Keyword: "AVOCADO.MD", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + // Search for matches on the contents of files when the criteria is a expression. + { + RepoIDs: []int64{62}, + Keyword: "console.log", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "example-file.js", + Content: "console.log(\"Hello, World!\")", + }, + }, + }, + // Search for matches on the contents of files when the criteria is part of a expression. + { + RepoIDs: []int64{62}, + Keyword: "log", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "example-file.js", + Content: "console.log(\"Hello, World!\")", + }, + }, + }, } for _, kw := range keywords { diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index b426b39bc2..a0c3dc4ad4 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -6,12 +6,13 @@ package bleve import ( "errors" "os" + "unicode" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/util" "github.com/blevesearch/bleve/v2" - "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/v2/index/upsidedown" "github.com/ethantkoenig/rupture" ) @@ -57,7 +58,7 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { // may be different on two string and they still be considered equivalent. // Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero. func GuessFuzzinessByKeyword(s string) int { - tokenizer := unicode.NewUnicodeTokenizer() + tokenizer := unicode_tokenizer.NewUnicodeTokenizer() tokens := tokenizer.Tokenize([]byte(s)) if len(tokens) > 0 { @@ -77,8 +78,10 @@ func guessFuzzinessByKeyword(s string) int { // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2 // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. + // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness + for _, r := range s { - if r >= 128 { + if r >= 128 || !unicode.IsLetter(r) { return 0 } } diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go index ae0b12c08d..8f7844464e 100644 --- a/modules/indexer/internal/bleve/util_test.go +++ b/modules/indexer/internal/bleve/util_test.go @@ -35,6 +35,14 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) { Input: "갃갃갃", Fuzziness: 0, }, + { + Input: "repo1", + Fuzziness: 0, + }, + { + Input: "avocado.md", + Fuzziness: 0, + }, } for _, scenario := range scenarios { diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/description b/tests/gitea-repositories-meta/org42/search-by-path.git/description index 382e2d7f10..ffc40a9c48 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/description +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/description @@ -4,5 +4,6 @@ This repository will be used to test code search. The snippet below shows its di ├── avocado.md ├── cucumber.md ├── ham.md -└── potato - └── ham.md +├── potato +| └── ham.md +└── example-file.js \ No newline at end of file diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs index 6b948c96a8..4adf83dda3 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs @@ -3,7 +3,7 @@ 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check -3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master +9f894b61946fd2f7b8b9d8e370e4d62f915522f5 refs/heads/master 62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update 4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check 3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph deleted file mode 100644 index b38715bb92b034596ebd83954969d4ac88da755e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1772 zcmZ>E5Aa}QWMT04ba7*V02d(J2f}1=advSGfv{N>++7@vAZ)fZ5E?|X-9WI1C5sX0 zD}0y1Lcr03y@C@%nCFIhS`8@7-k2uVVDERrWWP|nng>@1vDQY<_5}-;z1e) z=7#@*weZfl*icC+h}t;hc+{SMG7EV|-q&#biQOzYnJzrsIMrGDJ6zQ_7I ze@bduL~>jjr{C? zX|=y*#4|JAUiy5q_lvZ~ITxcY-}uV$?#j-(IxEjjTmj7K`=`1r|2(gEdustOF@i7< zu%t78Y`wlWy61NWzZ;+Vw!byseKh=7CrTizN#wg5B$jMsak*fL9mPsBt z5f9YQ0X2swg)9BuovQAI>i=F8+|4i+IG4uzFVfgb8mNXHs)i#@c&~{5)1>B@thCv+_BT)s3tWxDhV5GVvTg-MU3vm6^Gu?zFDi5EezgrG4^_hw zaqeCH=U@CW$EW}CVOqFj-^qL43Ppcz-$!;&3d0Otj=*iQ^LrM&d}b#8P?CK{--~XU zWyoq$A6iRKnWQH)^;m^k?L$fH>Xx-#969PgQQb4^IFL^?n4!sWr&=?L$zMr$c~6w; J6d~FCUjTa)2z>wm diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs index b2af8c8378..9774923d2e 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs @@ -1,2 +1,2 @@ -P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack +P pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap deleted file mode 100644 index 1fdef225e830cf65fa66e30e13f64bcc31b5feb8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 674 zcmZ?r4Dn@PWME}rVBog2Jv1q7kNRo7;}$alYQGEYtFrkD5(i=?C!2TaXGl&Ce zRJr;eP$>o#G&%nN4KOJ%&4|Kd0K0*K!2u|WVj_Ek1Wcs}SO=g`C wvVrC%>~g5o%UlPOhtVzm3=C5o7#PYeFQr{vDB+MdI5Or2 z@_DF$d4XoYFdvW&!~8%t3=06+=vWXaMh+GN>O;oDKt3)k0+b^aivjf@V{srK8A|~9 zzYYU}c~9mPsBt5wE2$>sCN_^G9q`r_}2g z``I{Fd~vWXJH~Q4Y_f8jAKQZJ)+=d~Jeb@(H7Cz_a(Pw^=WG99-)|ouzHUF`Wm;l( zQ+4af5()W_S~FcO?jBu~>Fp7?Ht>E{?R?dfl_rsuuUuk*hF8|rQ2F3t=;UpWb5(2NxuWM3X86|+A}LHI@7xM{;%*4RjD8M ztM9S?&YzN67Lgp6#_4zav~Skpr50g=M_+fgot!2$$7H3=uC>1_eov^gGUj~s{aW-Tex-}GCn)u zAmelq)!8b)8w+J!#Mbajq}BeKaOXnpg7S;c&n6gc)s?SYryVc3Y04x$p{d6z)M_6} zT35HM?c&H$|2ZR`neq10=bOD>q&3dD7-uStP>^fRCO;@|M#Nc zZicbIxisE?k;YcicOy%5_|u9e%4?^bURo%1fOYa)qsPX*%cF!kBYh1u^nEKoc%Cu) zWBU0Qf6Vdee|(r0?$~$op0`5LpWFA70zO2gaZY}vV5_?=vVPaU`Qf*lyX{I`wrskk zz{_m0o{z1O|MJhvZ@*r(T-SY}>FmvFk(Wmn*6b>I*t$14jdkOk#^apJdsSHGJ4CL@ zZ(Fj);Kl7bR$DAxoXwB->Xq2O-5wut@8Ofn&t6LY>AEOjd~)6;k+X`1C#v$E9Z_8O zo?+K1t0n7i)GvS7<=SUclMrwMSoB>67Qeq37(^cdi(M}u4a)z(dPEc`$6N-KuLM@F z!19tW7sw7}VBj_eivI>yo$GeE+Wg?F|5;{u!wN diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap new file mode 100644 index 0000000000000000000000000000000000000000..39c02c29877a8809da1e67f7cb474efab663d5cc GIT binary patch literal 678 zcmZ?r4Dn@PWME}rVBlW9?|aU-M_aN_h)vGgHks|Wz`CV{AaNjOf?^;A0_^`GG=n&h zMwP4o0hMAa;VhHT!*Z-#h-y;iUR{f&D&QS9G0B;)$#VY;i3t~Gku?h G76SlJfIs~J literal 0 HcmV?d00001 diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx new file mode 100644 index 0000000000000000000000000000000000000000..38d0e6b72265f254a87cf9777741ddcead7f60da GIT binary patch literal 2108 zcmb`IYc$k(7{~w2#f+Ab+=kpql(8A)S|N^0L$j3JA~dyBZj*9bsn}zrb!kgw4I0!g zV?$HyDr?J_n2s7-ggO(;FxgC7&ARSBUO30;9B0|>#pnFL-{6hI~Gv@y@(>{MF^ljMhR;F zg|#rZ6lL(su@2r=iz>`4MGgE?)WI)@2D~jo1ichZ@LDTiJ@hP~y$W=occth;?`qJ4 z{zddbe~bat441(O>i-90m|q>o6$j*=I)`@2u~F19yHr_&+s!yCN1J)BY;43vT}YHG zJ}=5$U!|OhVJFhAw6Qu}j;q>Ewukt?VyH!0+6v-W&Qp00{YWkstBChI!ptnR-QyA% zBqemIRq(TlB=_8L&bAxsz5Z4jUM=^9NhB6i8(+e3*k19M>7hNYuV>Zjb2P=Gdsy7{ zcRQSt+R>eRiwv`Uuw|)PJ+wj(S$9v)}p7rCE? z_v!DXL|?14-MjXII47HTQM1JAhGTQ>%*mPjuk6YA@!|Pz-k@;wM?XpU`_m{AReutO zi@d!=EzOU8>R3{YKJ$!FKAGi?vM0%&GcPMW7sYSsagZC9f6V-|B$?6TXiy3TBtz<`Q8ERGG zi4L9oCPtckeK6YWnCa2$;w#kwt>Y0ok==8#zxtIQpW_lsL$!a*#$Fa59^@nQQbRUx zov*o9kWqd3zJ&Ng8ZLEfP{({nTzNgg>4Poz08r}=%UDr5J{)0*kRtd>X`=4MGI7hj!cgl6sttclHL z)qF1MAFyw<+v&QMn@6HMO>|}LK-SfG&E5{%_d+-<7;&9T0nF5W|}wI_=<_iXbXqfQForJq2;u4QCm0L*WGI z8U&Gdg3}7O57h)G;0{D|A-s!V2C`bE0si@X*k=v(Aq2rrfrtB|@*LvS6o`fR_kb^C z>DqaypN3wzGs*(ct}p|6v=HCQtm?bcBOhKh#&oC^MRSUe2G|7Mzw;p5K zOfk?}NmB6m7CeC45AXM?3KOP)fmx@Ja-K8dqDypMbY|;mahYTgM@EkR+$$VStA(57 zT(vcO*N<%PRj>2DjbJG;4}l`X1qsodbt}E=KNnqM@}7qg)A`ex$E)-rEBprL3%XP^ zbF+p4xDW!X3zI++O=nUo5_1c3QgzcZb5ixPiWwF?e!IN*Sf%*pv}yJ0Jf$mbOc?{A zN;493L54jx?p+=w)EVh(sG;v$`N8vy*&kCA10YZ+$S+AO$!FNRKYaGjYUa5q?-{wG$xqi%@<`3e$=6W`&o9bJQB=|d i0FxgHVza3geFPMzzV~eQ;zhXZIIn=sz4Nl@8>Dz{-o^;KMo}ANkQhw zSedp8R=pc!L`t5@{TzA!d--M10t?3hfKS+M%ZQ0WP|IgQT;whk{Zb1cNX z0B{x%P8HSD^&x7%UQcPKJm(4A>YzOMUm1WM3tfl_3f|fWKdz3Ef3c9b5CWhHlS&ay zKr#|@K`I^__b!hT>WuU?)X?{>{NQ=U?2oC50T3t@k+yb%{eFPLaJ;IV!yeG>p$2gc!efJQ)dOY>^ZbB?plWV}z3+Po;MC>9KArWI2i@EN=s5^tj+r*bV z`KEU|8P9|ei9ee!BMSpEbmU+`gMvI9G!eH+4A4Oj1(+zJixgTYp^84zkf0)iHsV## oKn?YM@L9f}m&M)k=T~3;8poZh9M-`j3bobdH9ViT(&%P-Klc