diff --git a/kallithea/controllers/search.py b/kallithea/controllers/search.py --- a/kallithea/controllers/search.py +++ b/kallithea/controllers/search.py @@ -94,7 +94,9 @@ class SearchController(BaseRepoControlle qp = QueryParser(search_type, schema=schema_defn) if c.repo_name: - cur_query = u'repository:%s %s' % (c.repo_name, cur_query) + # use "repository_rawname:" instead of "repository:" + # for case-sensitive matching + cur_query = u'repository_rawname:%s %s' % (c.repo_name, cur_query) try: query = qp.parse(unicode(cur_query)) # extract words for highlight diff --git a/kallithea/lib/indexers/__init__.py b/kallithea/lib/indexers/__init__.py --- a/kallithea/lib/indexers/__init__.py +++ b/kallithea/lib/indexers/__init__.py @@ -53,10 +53,20 @@ ANALYZER = RegexTokenizer(expression=r"\ # ICASEIDANALYZER = IDTokenizer() | LowercaseFilter() +# CUSTOM ANALYZER raw-string +# +# This is useful to: +# - avoid tokenization +# - avoid removing "stop words" from text +# +IDANALYZER = IDTokenizer() + #INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), owner=TEXT(), + # this field preserves the case of the repository name for exact matching + repository_rawname=TEXT(analyzer=IDANALYZER), repository=TEXT(stored=True, analyzer=ICASEIDANALYZER), path=TEXT(stored=True), content=FieldType(format=Characters(), analyzer=ANALYZER, @@ -74,7 +84,10 @@ CHGSETS_SCHEMA = Schema( date=NUMERIC(stored=True), last=BOOLEAN(), owner=TEXT(), - repository=ID(unique=True, stored=True), + # this field preserves the case of the repository name for exact matching + # and uniqueness in the index table + repository_rawname=ID(unique=True), + repository=ID(stored=True, analyzer=ICASEIDANALYZER), author=TEXT(stored=True), message=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), diff --git a/kallithea/lib/indexers/daemon.py b/kallithea/lib/indexers/daemon.py --- a/kallithea/lib/indexers/daemon.py +++ b/kallithea/lib/indexers/daemon.py @@ -203,6 +203,7 @@ class 
WhooshIndexingDaemon(object): writer.add_document( fileid=p, owner=unicode(repo.contact), + repository_rawname=safe_unicode(repo_name), repository=safe_unicode(repo_name), path=p, content=u_content, @@ -241,6 +242,7 @@ class WhooshIndexingDaemon(object): raw_id=unicode(cs.raw_id), owner=unicode(repo.contact), date=cs._timestamp, + repository_rawname=safe_unicode(repo_name), repository=safe_unicode(repo_name), author=cs.author, message=cs.message, diff --git a/kallithea/tests/functional/test_search_indexing.py b/kallithea/tests/functional/test_search_indexing.py --- a/kallithea/tests/functional/test_search_indexing.py +++ b/kallithea/tests/functional/test_search_indexing.py @@ -126,9 +126,9 @@ class TestSearchControllerIndexing(TestC response.mustcontain('>%d results' % hit) @parametrize('searchtype,query,hit', [ - ('content', 'this_should_be_unique_content', 2), + ('content', 'this_should_be_unique_content', 1), ('commit', 'this_should_be_unique_commit_log', 1), - ('path', 'this_should_be_unique_filename.txt', 2), + ('path', 'this_should_be_unique_filename.txt', 1), ]) def test_repository_case_sensitivity(self, searchtype, query, hit): self.log_user() @@ -142,7 +142,7 @@ class TestSearchControllerIndexing(TestC response = self.app.get(url(controller='search', action='index'), {'q': q, 'type': searchtype}) - response.mustcontain('>%d results' % (hit * 2)) # (2) on the other hand, searching under the specific # repository should return results only for that repository,