Changeset - 8b7c0ef62427
[Not reviewed]
default
0 4 0
FUJIWARA Katsunori - 9 years ago 2017-01-22 18:17:38
foozy@lares.dti.ne.jp
search: make "repository:" condition work case-insensitively as expected

Before this revision, "repository:" condition at searching for "Commit
messages" never shows revisions in a repository, of which name uses
upper case letter.

Using ID for "repository" of CHGSETS_SCHEMA preserves case of
repository name at indexing. On the other hand, search condition
itself is forcibly lowered before parsing.

- files in repository "FOO" is indexed as "FOO" in "repository" field
- "repository:FOO" condition is treated as "repository:foo:

Then, indexing search itself is executed case-sensitively. Therefore,
"repository:FOO" condition never show revisions in repository "FOO".

But just making "repository" of CHGSETS_SCHEMA case-insensitive isn't
reasonable enough, because it breaks assumptions below, if there is
case-insensitive name collision between repositories, even though
Kallithea itself can manage such repositories at same time.

- combination of "raw_id" (= revision hash ID) and "repository" is
unique between all known revisions under Kallithea

CHGSETS_SCHEMA assumes this.

This unique-ness is required by Whoosh library to determine
whether index table should be updated or not for that repository.

- searching in a repository shows only revisions in that repository

Before this revision, this filtering is achieve by "repository:"
condition with case-preserved repository name from requested URL.

To make "repository:" search condition work case-insensitively as
expected (without any violation of assumptions above), this revision
does:

- make "repository" of CHGSETS_SCHEMA case-insensitive by
"analyzer=ICASEIDANALYZER"

- introduce "repository_rawname" into SCHEMA and CHGSETS_SCHEMA, to
ensure assumptions described above, by preserving case of
repository name

"repository_rawname" of SCHEMA uses not ID but TEXT, because the
former disable "positions" feature, which is required for
highlight-ing file content (see previous revision for detail).

This revision requires full re-building index tables, because indexing
schemas are changed.
4 files changed with 22 insertions and 5 deletions:
0 comments (0 inline, 0 general)
kallithea/controllers/search.py
Show inline comments
 
@@ -85,25 +85,27 @@ class SearchController(BaseRepoControlle
 
            log.debug(cur_query)
 

	
 
        if c.cur_query:
 
            p = safe_int(request.GET.get('page'), 1)
 
            highlight_items = set()
 
            try:
 
                idx = open_dir(config['app_conf']['index_dir'],
 
                               indexname=index_name)
 
                searcher = idx.searcher()
 

	
 
                qp = QueryParser(search_type, schema=schema_defn)
 
                if c.repo_name:
 
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
 
                    # use "repository_rawname:" instead of "repository:"
 
                    # for case-sensitive matching
 
                    cur_query = u'repository_rawname:%s %s' % (c.repo_name, cur_query)
 
                try:
 
                    query = qp.parse(unicode(cur_query))
 
                    # extract words for highlight
 
                    if isinstance(query, Phrase):
 
                        highlight_items.update(query.words)
 
                    elif isinstance(query, Prefix):
 
                        highlight_items.add(query.text)
 
                    else:
 
                        for i in query.all_terms():
 
                            if i[0] in ['content', 'message']:
 
                                highlight_items.add(i[1])
 

	
kallithea/lib/indexers/__init__.py
Show inline comments
 
@@ -44,46 +44,59 @@ log = logging.getLogger(__name__)
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 
# CUSTOM ANALYZER raw-string + lowercase filter
 
#
 
# This is useful to:
 
# - avoid tokenization
 
# - avoid removing "stop words" from text
 
# - search case-insensitively
 
#
 
ICASEIDANALYZER = IDTokenizer() | LowercaseFilter()
 

	
 
# CUSTOM ANALYZER raw-string
 
#
 
# This is useful to:
 
# - avoid tokenization
 
# - avoid removing "stop words" from text
 
#
 
IDANALYZER = IDTokenizer()
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(
 
    fileid=ID(unique=True),
 
    owner=TEXT(),
 
    # this field preserves case of repository name for exact matching
 
    repository_rawname=TEXT(analyzer=IDANALYZER),
 
    repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
 
    path=TEXT(stored=True),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = ContextFragmenter(200)
 

	
 
CHGSETS_SCHEMA = Schema(
 
    raw_id=ID(unique=True, stored=True),
 
    date=NUMERIC(stored=True),
 
    last=BOOLEAN(),
 
    owner=TEXT(),
 
    repository=ID(unique=True, stored=True),
 
    # this field preserves case of repository name for exact matching
 
    # and unique-ness in index table
 
    repository_rawname=ID(unique=True),
 
    repository=ID(stored=True, analyzer=ICASEIDANALYZER),
 
    author=TEXT(stored=True),
 
    message=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    parents=TEXT(),
 
    added=TEXT(),
 
    removed=TEXT(),
 
    changed=TEXT(),
 
)
 

	
 
CHGSET_IDX_NAME = 'CHGSET_INDEX'
 

	
 
# used only to generate queries in journal
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -194,24 +194,25 @@ class WhooshIndexingDaemon(object):
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository_rawname=repo.name_unicode,
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def index_changesets(self, writer, repo_name, repo, start_rev=None):
 
        """
 
        Add all changeset in the vcs repo starting at start_rev
 
        to the index writer
 
@@ -232,24 +233,25 @@ class WhooshIndexingDaemon(object):
 
        log.debug('indexing changesets in %s starting at rev: %s',
 
                  repo_name, start_rev)
 

	
 
        indexed = 0
 
        cs_iter = repo.get_changesets(start=start_rev)
 
        total = len(cs_iter)
 
        for cs in cs_iter:
 
            log.debug('    >> %s/%s', cs, total)
 
            writer.add_document(
 
                raw_id=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                date=cs._timestamp,
 
                repository_rawname=repo.name_unicode,
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                last=cs.last,
 
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
 
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
 
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        log.debug('indexed %d changesets for repo %s', indexed, repo_name)
kallithea/tests/functional/test_search_indexing.py
Show inline comments
 
@@ -117,41 +117,41 @@ class TestSearchControllerIndexing(TestC
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_tokenization(self, reponame, searchtype, query, hit):
 
        self.log_user()
 

	
 
        q = 'repository:%s %s' % (reponame, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 2),
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 2),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_case_sensitivity(self, searchtype, query, hit):
 
        self.log_user()
 

	
 
        lname = u'indexing_test-foo'
 
        uname = u'indexing_test-FOO'
 

	
 
        # (1) "repository:REPONAME" condition should match against
 
        # repositories case-insensitively
 
        q = 'repository:%s %s' % (lname, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
 
        response.mustcontain('>%d results' % (hit * 2))
 

	
 
        # (2) on the other hand, searching under the specific
 
        # repository should return results only for that repository,
 
        # even if specified name matches against another repository
 
        # case-insensitively.
 
        response = self.app.get(url(controller='search', action='index',
 
                                    repo_name=uname),
 
                                {'q': query, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
 

	
 
        # confirm that there is no matching against lower name repository
0 comments (0 inline, 0 general)