# HG changeset patch
# User FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
# Date 2017-01-22 18:17:38
# Node ID 168cc92c1b538d9cccf6f89dd08f43add8cc844f
# Parent  8b7c0ef62427ef354490f0cd195823b00c49d1e5

search: prevent pathname related conditions from removing "stop words"

Before this revision, the pathname-related conditions below caused
"stop words" to be ignored unintentionally.

  - path:, extension: (for "File contents" or "File names")
  - added:, removed:, changed: (for "Commit messages")

Therefore, pathname-related conditions with "this", "a", "you", and so
on were completely ignored, even if they are valid pathname components.

To prevent pathname-related conditions from removing "stop words",
this revision explicitly specifies an "analyzer" for the
pathname-related fields of SCHEMA and CHGSETS_SCHEMA.

The difference between PATHANALYZER and the default analyzer of TEXT
is whether "stop words" are preserved or not. Tokenization is still
applied to pathnames.

This revision requires a full rebuild of the index tables, because
the indexing schemas are changed.

diff --git a/kallithea/lib/indexers/__init__.py b/kallithea/lib/indexers/__init__.py
--- a/kallithea/lib/indexers/__init__.py
+++ b/kallithea/lib/indexers/__init__.py
@@ -61,6 +61,14 @@ ICASEIDANALYZER = IDTokenizer() | Lowerc
 #
 IDANALYZER = IDTokenizer()
 
+# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text
+#
+# This is useful to:
+# - avoid removing "stop words" from text
+# - search case-insensitively
+#
+PATHANALYZER = RegexTokenizer() | LowercaseFilter()
+
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
@@ -68,11 +76,11 @@ SCHEMA = Schema(
     # this field preserves case of repository name for exact matching
     repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
-    path=TEXT(stored=True),
+    path=TEXT(stored=True, analyzer=PATHANALYZER),
     content=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     modtime=STORED(),
-    extension=TEXT(stored=True)
+    extension=TEXT(stored=True, analyzer=PATHANALYZER)
 )
 
 IDX_NAME = 'HG_INDEX'
@@ -92,9 +100,9 @@ CHGSETS_SCHEMA = Schema(
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
-    added=TEXT(),
-    removed=TEXT(),
-    changed=TEXT(),
+    added=TEXT(analyzer=PATHANALYZER),
+    removed=TEXT(analyzer=PATHANALYZER),
+    changed=TEXT(analyzer=PATHANALYZER),
 )
 
 CHGSET_IDX_NAME = 'CHGSET_INDEX'
diff --git a/kallithea/tests/functional/test_search_indexing.py b/kallithea/tests/functional/test_search_indexing.py
--- a/kallithea/tests/functional/test_search_indexing.py
+++ b/kallithea/tests/functional/test_search_indexing.py
@@ -156,19 +156,19 @@ class TestSearchControllerIndexing(TestC
 
         # confirm that there is no matching against lower name repository
         assert uname in response
-        #assert lname not in response
+        assert lname not in response
 
     @parametrize('searchtype,query,hit', [
-        ('content', 'path:this/is/it def test', 37),
-        ('commit', 'added:this/is/it bother to ask where', 4),
+        ('content', 'path:this/is/it def test', 1),
+        ('commit', 'added:this/is/it bother to ask where', 1),
         # this condition matches against files below, because
         # "path:" condition is also applied on "repository path".
         # - "this/is/it" in "stopword_test" repo
         # - "this_should_be_unique_filename.txt" in "this-is-it" repo
-        ('path', 'this/is/it', 0),
+        ('path', 'this/is/it', 2),
 
-        ('content', 'extension:us', 0),
-        ('path', 'extension:us', 0),
+        ('content', 'extension:us', 1),
+        ('path', 'extension:us', 1),
     ])
     def test_filename_stopword(self, searchtype, query, hit):
         response = self.app.get(url(controller='search', action='index'),