Changeset - 8df1e9edd68f
[Not reviewed]
default
0 3 0
FUJIWARA Katsunori - 9 years ago 2017-01-30 11:09:45
foozy@lares.dti.ne.jp
indexers: use correct full repository name, which contains group name, at indexing

Before this revision, searching under the specific repository could
cause unexpected result, because repository names used for indexing didn't
contain the group name.

This issue was introduced by 8b7c0ef62427, which uses
repo.name_unicode as repository name instead of
safe_unicode(repo_name) to reduce unicode conversion cost while
repetition at indexing.

To use correct repository name at indexing, this revision replaces
repo.name_unicode by safe_unicode(repo_name). Reducing cost of repeated
unicode conversion cost while will (and should) be addressed in the
future.

This revision also adds a comment to BaseRepository.name property, to
avoid similar misunderstandings in the future.
3 files changed with 24 insertions and 2 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -158,136 +158,136 @@ class WhooshIndexingDaemon(object):
 
        assert path[len(repo_path)] in (os.path.sep, os.path.altsep)
 
        node_path = path[len(repo_path) + 1:]
 
        cs = self._get_index_changeset(repo, index_rev=index_rev)
 
        node = cs.get_node(node_path)
 
        return node
 

	
 
    def is_indexable_node(self, node):
 
        """
 
        Just index the content of chosen files, skipping binary files
 
        """
 
        return (node.extension in INDEX_EXTENSIONS or node.name in INDEX_FILENAMES) and \
 
               not node.is_binary
 

	
 
    def get_node_mtime(self, node):
 
        return mktime(node.last_changeset.date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name, index_rev=None):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 
        try:
 
            node = self.get_node(repo, path, index_rev)
 
        except (ChangesetError, NodeDoesNotExistError):
 
            log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
 
            return 0, 0
 

	
 
        indexed = indexed_w_content = 0
 
        if self.is_indexable_node(node):
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]', path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository_rawname=repo.name_unicode,
 
            repository_rawname=safe_unicode(repo_name),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def index_changesets(self, writer, repo_name, repo, start_rev=None):
 
        """
 
        Add all changeset in the vcs repo starting at start_rev
 
        to the index writer
 

	
 
        :param writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository from whence the
 
          changeset originates including the repository group
 
        :param repo: the vcs repository instance to index changesets for,
 
          the presumption is the repo has changesets to index
 
        :param start_rev=None: the full sha id to start indexing from
 
          if start_rev is None then index from the first changeset in
 
          the repo
 
        """
 

	
 
        if start_rev is None:
 
            start_rev = repo[0].raw_id
 

	
 
        log.debug('indexing changesets in %s starting at rev: %s',
 
                  repo_name, start_rev)
 

	
 
        indexed = 0
 
        cs_iter = repo.get_changesets(start=start_rev)
 
        total = len(cs_iter)
 
        for cs in cs_iter:
 
            log.debug('    >> %s/%s', cs, total)
 
            writer.add_document(
 
                raw_id=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                date=cs._timestamp,
 
                repository_rawname=repo.name_unicode,
 
                repository_rawname=safe_unicode(repo_name),
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                last=cs.last,
 
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
 
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
 
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        log.debug('indexed %d changesets for repo %s', indexed, repo_name)
 
        return indexed
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
 
        """
 
        Index files for given repo_name
 

	
 
        :param file_idx_writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository we're indexing
 
        :param repo: instance of vcs repo
 
        """
 
        i_cnt = iwc_cnt = 0
 
        log.debug('building index for %s @revision:%s', repo.path,
 
                                                self._get_index_revision(repo))
 
        index_rev = self._get_index_revision(repo)
 
        for idx_path in self.get_paths(repo):
 
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name, index_rev)
 
            i_cnt += i
 
            iwc_cnt += iwc
 

	
 
        log.debug('added %s files %s with content for repo %s',
 
                  i_cnt + iwc_cnt, iwc_cnt, repo.path)
 
        return i_cnt, iwc_cnt
 

	
 
    def update_changeset_index(self):
 
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
 

	
 
        with idx.searcher() as searcher:
 
            writer = idx.writer()
 
            writer_is_dirty = False
 
            try:
 
                indexed_total = 0
 
                repo_name = None
 
                for repo_name, repo in self.repo_paths.items():
 
                    # skip indexing if there aren't any revs in the repo
 
                    num_of_revs = len(repo)
 
                    if num_of_revs < 1:
kallithea/lib/vcs/backends/base.py
Show inline comments
 
@@ -52,96 +52,99 @@ class BaseRepository(object):
 
            branches as list of changesets
 

	
 
        ``tags``
 
            tags as list of changesets
 
    """
 
    scm = None
 
    DEFAULT_BRANCH_NAME = None
 
    EMPTY_CHANGESET = '0' * 40
 

	
 
    def __init__(self, repo_path, create=False, **kwargs):
 
        """
 
        Initializes repository. Raises RepositoryError if repository could
 
        not be find at the given ``repo_path`` or directory at ``repo_path``
 
        exists and ``create`` is set to True.
 

	
 
        :param repo_path: local path of the repository
 
        :param create=False: if set to True, would try to create repository.
 
        :param src_url=None: if set, should be proper url from which repository
 
          would be cloned; requires ``create`` parameter to be set to True -
 
          raises RepositoryError if src_url is set and create evaluates to
 
          False
 
        """
 
        raise NotImplementedError
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, self.path)
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return self.count()
 

	
 
    def __eq__(self, other):
 
        same_instance = isinstance(other, self.__class__)
 
        return same_instance and getattr(other, 'path', None) == self.path
 

	
 
    def __ne__(self, other):
 
        return not self.__eq__(other)
 

	
 
    @LazyProperty
 
    def alias(self):
 
        for k, v in settings.BACKENDS.items():
 
            if v.split('.')[-1] == str(self.__class__.__name__):
 
                return k
 

	
 
    @LazyProperty
 
    def name(self):
 
        """
 
        Return repository name (without group name)
 
        """
 
        raise NotImplementedError
 

	
 
    @property
 
    def name_unicode(self):
 
        return safe_unicode(self.name)
 

	
 
    @LazyProperty
 
    def owner(self):
 
        raise NotImplementedError
 

	
 
    @LazyProperty
 
    def description(self):
 
        raise NotImplementedError
 

	
 
    @LazyProperty
 
    def size(self):
 
        """
 
        Returns combined size in bytes for all repository files
 
        """
 

	
 
        size = 0
 
        try:
 
            tip = self.get_changeset()
 
            for topnode, dirs, files in tip.walk('/'):
 
                for f in files:
 
                    size += tip.get_file_size(f.path)
 

	
 
        except RepositoryError as e:
 
            pass
 
        return size
 

	
 
    def is_valid(self):
 
        """
 
        Validates repository.
 
        """
 
        raise NotImplementedError
 

	
 
    def is_empty(self):
 
        return self._empty
 

	
 
    #==========================================================================
 
    # CHANGESETS
 
    #==========================================================================
 

	
 
    def get_changeset(self, revision=None):
 
        """
 
        Returns instance of ``Changeset`` class. If ``revision`` is None, most
 
        recent changeset is returned.
kallithea/tests/functional/test_search_indexing.py
Show inline comments
 
@@ -80,96 +80,115 @@ class TestSearchControllerIndexing(TestC
 
                repo = fixture.create_fork(init_or_fork, reponame,
 
                                           repo_group=groupname)
 
            repoids[reponame] = repo.repo_id
 

	
 
        # treat "it" as indexable filename
 
        filenames_mock = list(INDEX_FILENAMES)
 
        filenames_mock.append('it')
 
        with mock.patch('kallithea.lib.indexers.daemon.INDEX_FILENAMES',
 
                        filenames_mock):
 
            rebuild_index(full_index=False) # only for newly added repos
 

	
 
    @classmethod
 
    def teardown_class(cls):
 
        # delete in reversed order, to delete fork destination at first
 
        for reponame, init_or_fork, groupname in reversed(repos):
 
            RepoModel().delete(repoids[reponame])
 

	
 
        for reponame, init_or_fork, groupname in reversed(repos):
 
            if groupname in groupids:
 
                RepoGroupModel().delete(groupids.pop(groupname),
 
                                        force_delete=True)
 

	
 
        Session().commit()
 
        Session.remove()
 

	
 
        rebuild_index(full_index=True) # rebuild fully for subsequent tests
 

	
 
    @parametrize('reponame', [
 
        (u'indexing_test'),
 
        (u'indexing_test-fork'),
 
        (u'group/indexing_test'),
 
        (u'this-is-it'),
 
        (u'*-fork'),
 
        (u'group/*'),
 
    ])
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_tokenization(self, reponame, searchtype, query, hit):
 
        self.log_user()
 

	
 
        q = 'repository:%s %s' % (reponame, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('reponame', [
 
        (u'indexing_test'),
 
        (u'indexing_test-fork'),
 
        (u'group/indexing_test'),
 
        (u'this-is-it'),
 
    ])
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_searching_under_repository(self, reponame, searchtype, query, hit):
 
        self.log_user()
 

	
 
        response = self.app.get(url(controller='search', action='index',
 
                                    repo_name=reponame),
 
                                {'q': query, 'type': searchtype})
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_case_sensitivity(self, searchtype, query, hit):
 
        self.log_user()
 

	
 
        lname = u'indexing_test-foo'
 
        uname = u'indexing_test-FOO'
 

	
 
        # (1) "repository:REPONAME" condition should match against
 
        # repositories case-insensitively
 
        q = 'repository:%s %s' % (lname, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % (hit * 2))
 

	
 
        # (2) on the other hand, searching under the specific
 
        # repository should return results only for that repository,
 
        # even if specified name matches against another repository
 
        # case-insensitively.
 
        response = self.app.get(url(controller='search', action='index',
 
                                    repo_name=uname),
 
                                {'q': query, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
 

	
 
        # confirm that there is no matching against lower name repository
 
        assert uname in response
 
        assert lname not in response
 

	
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'path:this/is/it def test', 1),
 
        ('commit', 'added:this/is/it bother to ask where', 1),
 
        # this condition matches against files below, because
 
        # "path:" condition is also applied on "repository path".
 
        # - "this/is/it" in "stopword_test" repo
 
        # - "this_should_be_unique_filename.txt" in "this-is-it" repo
 
        ('path', 'this/is/it', 2),
 

	
 
        ('content', 'extension:us', 1),
 
        ('path', 'extension:us', 1),
 
    ])
 
    def test_filename_stopword(self, searchtype, query, hit):
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': query, 'type': searchtype})
0 comments (0 inline, 0 general)