# HG changeset patch # User FUJIWARA Katsunori # Date 2017-01-22 18:17:38 # Node ID 168cc92c1b538d9cccf6f89dd08f43add8cc844f # Parent 8b7c0ef62427ef354490f0cd195823b00c49d1e5 search: prevent pathname related conditions from removing "stop words" Before this revision, the pathname-related conditions below unintentionally ignore "stop words". - path:, extension: (for "File contents" or "File names") - added:, removed:, changed: (for "Commit messages") Therefore, "this", "a", "you", and so on in pathname-related conditions are completely ignored, even if they are valid pathname components. To prevent pathname-related conditions from removing "stop words", this revision explicitly specifies an "analyzer" for the pathname-related fields of SCHEMA and CHGSETS_SCHEMA. The difference between PATHANALYZER and the default analyzer of TEXT is whether "stop words" are preserved or not. Tokenization is still applied to pathnames. This revision requires fully rebuilding the index tables, because the indexing schemas are changed. 
diff --git a/kallithea/lib/indexers/__init__.py b/kallithea/lib/indexers/__init__.py --- a/kallithea/lib/indexers/__init__.py +++ b/kallithea/lib/indexers/__init__.py @@ -61,6 +61,14 @@ ICASEIDANALYZER = IDTokenizer() | Lowerc # IDANALYZER = IDTokenizer() +# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text +# +# This is useful to: +# - avoid removing "stop words" from text +# - search case-insensitively +# +PATHANALYZER = RegexTokenizer() | LowercaseFilter() + #INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), @@ -68,11 +76,11 @@ SCHEMA = Schema( # this field preserves case of repository name for exact matching repository_rawname=TEXT(analyzer=IDANALYZER), repository=TEXT(stored=True, analyzer=ICASEIDANALYZER), - path=TEXT(stored=True), + path=TEXT(stored=True, analyzer=PATHANALYZER), content=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), modtime=STORED(), - extension=TEXT(stored=True) + extension=TEXT(stored=True, analyzer=PATHANALYZER) ) IDX_NAME = 'HG_INDEX' @@ -92,9 +100,9 @@ CHGSETS_SCHEMA = Schema( message=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), parents=TEXT(), - added=TEXT(), - removed=TEXT(), - changed=TEXT(), + added=TEXT(analyzer=PATHANALYZER), + removed=TEXT(analyzer=PATHANALYZER), + changed=TEXT(analyzer=PATHANALYZER), ) CHGSET_IDX_NAME = 'CHGSET_INDEX' diff --git a/kallithea/tests/functional/test_search_indexing.py b/kallithea/tests/functional/test_search_indexing.py --- a/kallithea/tests/functional/test_search_indexing.py +++ b/kallithea/tests/functional/test_search_indexing.py @@ -156,19 +156,19 @@ class TestSearchControllerIndexing(TestC # confirm that there is no matching against lower name repository assert uname in response - #assert lname not in response + assert lname not in response @parametrize('searchtype,query,hit', [ - ('content', 'path:this/is/it def test', 37), - ('commit', 'added:this/is/it bother to ask where', 4), + ('content', 
'path:this/is/it def test', 1), + ('commit', 'added:this/is/it bother to ask where', 1), # this condition matches against files below, because # "path:" condition is also applied on "repository path". # - "this/is/it" in "stopword_test" repo # - "this_should_be_unique_filename.txt" in "this-is-it" repo - ('path', 'this/is/it', 0), + ('path', 'this/is/it', 2), - ('content', 'extension:us', 0), - ('path', 'extension:us', 0), + ('content', 'extension:us', 1), + ('path', 'extension:us', 1), ]) def test_filename_stopword(self, searchtype, query, hit): response = self.app.get(url(controller='search', action='index'),