diff --git a/kallithea/lib/indexers/__init__.py b/kallithea/lib/indexers/__init__.py --- a/kallithea/lib/indexers/__init__.py +++ b/kallithea/lib/indexers/__init__.py @@ -61,6 +61,14 @@ ICASEIDANALYZER = IDTokenizer() | Lowerc # IDANALYZER = IDTokenizer() +# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text +# +# This is useful to: +# - avoid removing "stop words" from text +# - search case-insensitively +# +PATHANALYZER = RegexTokenizer() | LowercaseFilter() + #INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), @@ -68,11 +76,11 @@ SCHEMA = Schema( # this field preserves case of repository name for exact matching repository_rawname=TEXT(analyzer=IDANALYZER), repository=TEXT(stored=True, analyzer=ICASEIDANALYZER), - path=TEXT(stored=True), + path=TEXT(stored=True, analyzer=PATHANALYZER), content=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), modtime=STORED(), - extension=TEXT(stored=True) + extension=TEXT(stored=True, analyzer=PATHANALYZER) ) IDX_NAME = 'HG_INDEX' @@ -92,9 +100,9 @@ CHGSETS_SCHEMA = Schema( message=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), parents=TEXT(), - added=TEXT(), - removed=TEXT(), - changed=TEXT(), + added=TEXT(analyzer=PATHANALYZER), + removed=TEXT(analyzer=PATHANALYZER), + changed=TEXT(analyzer=PATHANALYZER), ) CHGSET_IDX_NAME = 'CHGSET_INDEX' diff --git a/kallithea/tests/functional/test_search_indexing.py b/kallithea/tests/functional/test_search_indexing.py --- a/kallithea/tests/functional/test_search_indexing.py +++ b/kallithea/tests/functional/test_search_indexing.py @@ -156,19 +156,19 @@ class TestSearchControllerIndexing(TestC # confirm that there is no matching against lower name repository assert uname in response - #assert lname not in response + assert lname not in response @parametrize('searchtype,query,hit', [ - ('content', 'path:this/is/it def test', 37), - ('commit', 'added:this/is/it bother to ask where', 4), + ('content', 'path:this/is/it def test', 1), + ('commit', 'added:this/is/it bother to ask where', 1), # this condition matches against files below, because # "path:" condition is also applied on "repository path". # - "this/is/it" in "stopword_test" repo # - "this_should_be_unique_filename.txt" in "this-is-it" repo - ('path', 'this/is/it', 0), + ('path', 'this/is/it', 2), - ('content', 'extension:us', 0), - ('path', 'extension:us', 0), + ('content', 'extension:us', 1), + ('path', 'extension:us', 1), ]) def test_filename_stopword(self, searchtype, query, hit): response = self.app.get(url(controller='search', action='index'),