kallithea Changeset - 8b7c0ef62427

Changeset - 8b7c0ef62427

Parent rev.

Child rev.

[Not reviewed]

default

0 4 0

FUJIWARA Katsunori - 9 years ago 2017-01-22 18:17:38
foozy@lares.dti.ne.jp

search: make "repository:" condition work case-insensitively as expected

Before this revision, a "repository:" condition in a "Commit
messages" search never showed revisions in a repository whose name
uses upper case letters.

Using ID for the "repository" field of CHGSETS_SCHEMA preserves the
case of the repository name at indexing time. On the other hand, the
search condition itself is forcibly lowercased before parsing.

- files in repository "FOO" are indexed as "FOO" in the "repository" field
- the "repository:FOO" condition is treated as "repository:foo"

Then, the index search itself is executed case-sensitively. Therefore,
a "repository:FOO" condition never shows revisions in repository "FOO".

But just making the "repository" field of CHGSETS_SCHEMA
case-insensitive isn't sufficient, because it breaks the assumptions
below when repository names collide case-insensitively, even though
Kallithea itself can manage such repositories at the same time.

- combination of "raw_id" (= revision hash ID) and "repository" is
unique between all known revisions under Kallithea

CHGSETS_SCHEMA assumes this.

This uniqueness is required by the Whoosh library to determine
whether the index table should be updated for that repository.

- searching in a repository shows only revisions in that repository

Before this revision, this filtering was achieved by a "repository:"
condition with the case-preserved repository name from the requested URL.

To make "repository:" search condition work case-insensitively as
expected (without any violation of assumptions above), this revision
does:

- make "repository" of CHGSETS_SCHEMA case-insensitive by
"analyzer=ICASEIDANALYZER"

- introduce "repository_rawname" into SCHEMA and CHGSETS_SCHEMA, to
ensure assumptions described above, by preserving case of
repository name

"repository_rawname" of SCHEMA uses TEXT rather than ID, because ID
disables the "positions" feature, which is required for
highlighting file content (see the previous revision for details).

This revision requires a full rebuild of the index tables, because the
indexing schemas have changed.

4 files changed with 22 insertions and 5 deletions:

kallithea/controllers/search.py

kallithea/lib/indexers/__init__.py

kallithea/lib/indexers/daemon.py

kallithea/tests/functional/test_search_indexing.py

0 comments (0 inline, 0 general)

kallithea/controllers/search.py

➞

Show inline comments

@@ @@ -85,25 +85,27 @@ class SearchController(BaseRepoControlle @@
             log.debug(cur_query)
         if c.cur_query:
             p = safe_int(request.GET.get('page'), 1)
             highlight_items = set()
             try:
                 idx = open_dir(config['app_conf']['index_dir'],
                                indexname=index_name)
                 searcher = idx.searcher()
                 qp = QueryParser(search_type, schema=schema_defn)
                 if c.repo_name:
                     cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
                     # use "repository_rawname:" instead of "repository:"
                     # for case-sensitive matching
                     cur_query = u'repository_rawname:%s %s' % (c.repo_name, cur_query)
                 try:
                     query = qp.parse(unicode(cur_query))
                     # extract words for highlight
                     if isinstance(query, Phrase):
                         highlight_items.update(query.words)
                     elif isinstance(query, Prefix):
                         highlight_items.add(query.text)
                     else:
                         for i in query.all_terms():
                             if i[0] in ['content', 'message']:
                                 highlight_items.add(i[1])

kallithea/lib/indexers/__init__.py

➞

Show inline comments

@@ @@ -44,46 +44,59 @@ log = logging.getLogger(__name__) @@
 # CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 # CUSTOM ANALYZER raw-string + lowercase filter
+#
 # This is useful to:
 # - avoid tokenization
 # - avoid removing "stop words" from text
 # - search case-insensitively
+#
 ICASEIDANALYZER = IDTokenizer() | LowercaseFilter()
 # CUSTOM ANALYZER raw-string
+#
 # This is useful to:
 # - avoid tokenization
 # - avoid removing "stop words" from text
+#
 IDANALYZER = IDTokenizer()
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
     owner=TEXT(),
     # this field preserves case of repository name for exact matching
     repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
     path=TEXT(stored=True),
     content=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     modtime=STORED(),
     extension=TEXT(stored=True)
+)
 IDX_NAME = 'HG_INDEX'
 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 FRAGMENTER = ContextFragmenter(200)
 CHGSETS_SCHEMA = Schema(
     raw_id=ID(unique=True, stored=True),
     date=NUMERIC(stored=True),
     last=BOOLEAN(),
     owner=TEXT(),
     repository=ID(unique=True, stored=True),
     # this field preserves case of repository name for exact matching
     # and unique-ness in index table
     repository_rawname=ID(unique=True),
     repository=ID(stored=True, analyzer=ICASEIDANALYZER),
     author=TEXT(stored=True),
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
     added=TEXT(),
     removed=TEXT(),
     changed=TEXT(),
+)
 CHGSET_IDX_NAME = 'CHGSET_INDEX'
 # used only to generate queries in journal

kallithea/lib/indexers/daemon.py

➞

Show inline comments

@@ @@ -194,24 +194,25 @@ class WhooshIndexingDaemon(object): @@
                 indexed_w_content += 1
         else:
             log.debug('    >> %s', path)
             # just index file name without it's content
             u_content = u''
             indexed += 1
         p = safe_unicode(path)
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
             repository_rawname=repo.name_unicode,
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
             modtime=self.get_node_mtime(node),
             extension=node.extension
+        )
         return indexed, indexed_w_content
     def index_changesets(self, writer, repo_name, repo, start_rev=None):
         """
         Add all changeset in the vcs repo starting at start_rev
         to the index writer
@@ @@ -232,24 +233,25 @@ class WhooshIndexingDaemon(object): @@
         log.debug('indexing changesets in %s starting at rev: %s',
                   repo_name, start_rev)
         indexed = 0
         cs_iter = repo.get_changesets(start=start_rev)
         total = len(cs_iter)
         for cs in cs_iter:
             log.debug('    >> %s/%s', cs, total)
             writer.add_document(
                 raw_id=unicode(cs.raw_id),
                 owner=unicode(repo.contact),
                 date=cs._timestamp,
                 repository_rawname=repo.name_unicode,
                 repository=safe_unicode(repo_name),
                 author=cs.author,
                 message=cs.message,
                 last=cs.last,
                 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
                 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
                 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
                 parents=u' '.join([cs.raw_id for cs in cs.parents]),
+            )
             indexed += 1
         log.debug('indexed %d changesets for repo %s', indexed, repo_name)

kallithea/tests/functional/test_search_indexing.py

➞

Show inline comments

@@ @@ -117,41 +117,41 @@ class TestSearchControllerIndexing(TestC @@
         ('commit', 'this_should_be_unique_commit_log', 1),
         ('path', 'this_should_be_unique_filename.txt', 1),
     ])
     def test_repository_tokenization(self, reponame, searchtype, query, hit):
         self.log_user()
         q = 'repository:%s %s' % (reponame, query)
         response = self.app.get(url(controller='search', action='index'),
                                 {'q': q, 'type': searchtype})
         response.mustcontain('>%d results' % hit)
     @parametrize('searchtype,query,hit', [
-        ('content', 'this_should_be_unique_content', 2),
+        ('content', 'this_should_be_unique_content', 1),
         ('commit', 'this_should_be_unique_commit_log', 1),
-        ('path', 'this_should_be_unique_filename.txt', 2),
+        ('path', 'this_should_be_unique_filename.txt', 1),
     ])
     def test_repository_case_sensitivity(self, searchtype, query, hit):
         self.log_user()
         lname = u'indexing_test-foo'
         uname = u'indexing_test-FOO'
         # (1) "repository:REPONAME" condition should match against
         # repositories case-insensitively
         q = 'repository:%s %s' % (lname, query)
         response = self.app.get(url(controller='search', action='index'),
                                 {'q': q, 'type': searchtype})
         response.mustcontain('>%d results' % hit)
+        response.mustcontain('>%d results' % (hit * 2))
         # (2) on the other hand, searching under the specific
         # repository should return results only for that repository,
         # even if specified name matches against another repository
         # case-insensitively.
         response = self.app.get(url(controller='search', action='index',
                                     repo_name=uname),
                                 {'q': query, 'type': searchtype})
         response.mustcontain('>%d results' % hit)
         # confirm that there is no matching against lower name repository

0 comments (0 inline, 0 general)