Changeset - 8df1e9edd68f
[Not reviewed]
default
0 3 0
FUJIWARA Katsunori - 9 years ago 2017-01-30 11:09:45
foozy@lares.dti.ne.jp
indexers: use correct full repository name, which contains group name, at indexing

Before this revision, searching under the specific repository could
cause unexpected result, because repository names used for indexing didn't
contain the group name.

This issue was introduced by 8b7c0ef62427, which uses
repo.name_unicode as repository name instead of
safe_unicode(repo_name) to reduce unicode conversion cost while
repetition at indexing.

To use correct repository name at indexing, this revision replaces
repo.name_unicode by safe_unicode(repo_name). Reducing cost of repeated
unicode conversion cost while will (and should) be addressed in the
future.

This revision also adds a comment to BaseRepository.name property, to
avoid similar misunderstandings in the future.
3 files changed with 24 insertions and 2 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -110,232 +110,232 @@ class WhooshIndexingDaemon(object):
 
                     'index does not exist')
 
        else:
 
            self.initial = False
 

	
 
    def _get_index_revision(self, repo):
 
        db_repo = Repository.get_by_repo_name(repo.name_unicode)
 
        landing_rev = 'tip'
 
        if db_repo:
 
            _rev_type, _rev = db_repo.landing_rev
 
            landing_rev = _rev
 
        return landing_rev
 

	
 
    def _get_index_changeset(self, repo, index_rev=None):
 
        if not index_rev:
 
            index_rev = self._get_index_revision(repo)
 
        cs = repo.get_changeset(index_rev)
 
        return cs
 

	
 
    def get_paths(self, repo):
 
        """
 
        recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        index_paths_ = set()
 
        try:
 
            cs = self._get_index_changeset(repo)
 
            for _topnode, _dirs, files in cs.walk('/'):
 
                for f in files:
 
                    index_paths_.add(os.path.join(safe_str(repo.path), safe_str(f.path)))
 

	
 
        except RepositoryError:
 
            log.debug(traceback.format_exc())
 
            pass
 
        return index_paths_
 

	
 
    def get_node(self, repo, path, index_rev=None):
 
        """
 
        gets a filenode based on given full path. It operates on string for
 
        hg git compatibility.
 

	
 
        :param repo: scm repo instance
 
        :param path: full path including root location
 
        :return: FileNode
 
        """
 
        # FIXME: paths should be normalized ... or even better: don't include repo.path
 
        path = safe_str(path)
 
        repo_path = safe_str(repo.path)
 
        assert path.startswith(repo_path)
 
        assert path[len(repo_path)] in (os.path.sep, os.path.altsep)
 
        node_path = path[len(repo_path) + 1:]
 
        cs = self._get_index_changeset(repo, index_rev=index_rev)
 
        node = cs.get_node(node_path)
 
        return node
 

	
 
    def is_indexable_node(self, node):
 
        """
 
        Just index the content of chosen files, skipping binary files
 
        """
 
        return (node.extension in INDEX_EXTENSIONS or node.name in INDEX_FILENAMES) and \
 
               not node.is_binary
 

	
 
    def get_node_mtime(self, node):
 
        return mktime(node.last_changeset.date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name, index_rev=None):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 
        try:
 
            node = self.get_node(repo, path, index_rev)
 
        except (ChangesetError, NodeDoesNotExistError):
 
            log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
 
            return 0, 0
 

	
 
        indexed = indexed_w_content = 0
 
        if self.is_indexable_node(node):
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]', path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository_rawname=repo.name_unicode,
 
            repository_rawname=safe_unicode(repo_name),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def index_changesets(self, writer, repo_name, repo, start_rev=None):
 
        """
 
        Add all changeset in the vcs repo starting at start_rev
 
        to the index writer
 

	
 
        :param writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository from whence the
 
          changeset originates including the repository group
 
        :param repo: the vcs repository instance to index changesets for,
 
          the presumption is the repo has changesets to index
 
        :param start_rev=None: the full sha id to start indexing from
 
          if start_rev is None then index from the first changeset in
 
          the repo
 
        """
 

	
 
        if start_rev is None:
 
            start_rev = repo[0].raw_id
 

	
 
        log.debug('indexing changesets in %s starting at rev: %s',
 
                  repo_name, start_rev)
 

	
 
        indexed = 0
 
        cs_iter = repo.get_changesets(start=start_rev)
 
        total = len(cs_iter)
 
        for cs in cs_iter:
 
            log.debug('    >> %s/%s', cs, total)
 
            writer.add_document(
 
                raw_id=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                date=cs._timestamp,
 
                repository_rawname=repo.name_unicode,
 
                repository_rawname=safe_unicode(repo_name),
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                last=cs.last,
 
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
 
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
 
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        log.debug('indexed %d changesets for repo %s', indexed, repo_name)
 
        return indexed
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
 
        """
 
        Index files for given repo_name
 

	
 
        :param file_idx_writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository we're indexing
 
        :param repo: instance of vcs repo
 
        """
 
        i_cnt = iwc_cnt = 0
 
        log.debug('building index for %s @revision:%s', repo.path,
 
                                                self._get_index_revision(repo))
 
        index_rev = self._get_index_revision(repo)
 
        for idx_path in self.get_paths(repo):
 
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name, index_rev)
 
            i_cnt += i
 
            iwc_cnt += iwc
 

	
 
        log.debug('added %s files %s with content for repo %s',
 
                  i_cnt + iwc_cnt, iwc_cnt, repo.path)
 
        return i_cnt, iwc_cnt
 

	
 
    def update_changeset_index(self):
 
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
 

	
 
        with idx.searcher() as searcher:
 
            writer = idx.writer()
 
            writer_is_dirty = False
 
            try:
 
                indexed_total = 0
 
                repo_name = None
 
                for repo_name, repo in self.repo_paths.items():
 
                    # skip indexing if there aren't any revs in the repo
 
                    num_of_revs = len(repo)
 
                    if num_of_revs < 1:
 
                        continue
 

	
 
                    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
 
                    q = qp.parse(u"last:t AND %s" % repo_name)
 

	
 
                    results = searcher.search(q)
 

	
 
                    # default to scanning the entire repo
 
                    last_rev = 0
 
                    start_id = None
 

	
 
                    if len(results) > 0:
 
                        # assuming that there is only one result, if not this
 
                        # may require a full re-index.
 
                        start_id = results[0]['raw_id']
 
                        last_rev = repo.get_changeset(revision=start_id).revision
 

	
 
                    # there are new changesets to index or a new repo to index
 
                    if last_rev == 0 or num_of_revs > last_rev + 1:
 
                        # delete the docs in the index for the previous
 
                        # last changeset(s)
 
                        for hit in results:
 
                            q = qp.parse(u"last:t AND %s AND raw_id:%s" %
 
                                            (repo_name, hit['raw_id']))
 
                            writer.delete_by_query(q)
 

	
 
                        # index from the previous last changeset + all new ones
 
                        indexed_total += self.index_changesets(writer,
 
                                                repo_name, repo, start_id)
 
                        writer_is_dirty = True
 
                log.debug('indexed %s changesets for repo %s',
 
                          indexed_total, repo_name
 
                )
 
            finally:
 
                if writer_is_dirty:
 
                    log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
 
                    writer.commit(merge=True)
 
                    log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
 
                else:
 
                    log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
 

	
 
    def update_file_index(self):
 
        log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
 
                   'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 

	
 
        idx = open_dir(self.index_location, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
kallithea/lib/vcs/backends/base.py
Show inline comments
 
@@ -4,192 +4,195 @@
 
    ~~~~~~~~~~~~~~~~~
 

	
 
    Base for all available scm backends
 

	
 
    :created_on: Apr 8, 2010
 
    :copyright: (c) 2010-2011 by Marcin Kuzminski, Lukasz Balcerzak.
 
"""
 

	
 
import datetime
 
import itertools
 

	
 
from kallithea.lib.vcs.utils import author_name, author_email, safe_unicode
 
from kallithea.lib.vcs.utils.lazy import LazyProperty
 
from kallithea.lib.vcs.utils.helpers import get_dict_for_attrs
 
from kallithea.lib.vcs.conf import settings
 

	
 
from kallithea.lib.vcs.exceptions import (
 
    ChangesetError, EmptyRepositoryError, NodeAlreadyAddedError,
 
    NodeAlreadyChangedError, NodeAlreadyExistsError, NodeAlreadyRemovedError,
 
    NodeDoesNotExistError, NodeNotChangedError, RepositoryError
 
)
 

	
 

	
 
class BaseRepository(object):
 
    """
 
    Base Repository for final backends
 

	
 
    **Attributes**
 

	
 
        ``DEFAULT_BRANCH_NAME``
 
            name of default branch (i.e. "trunk" for svn, "master" for git etc.
 

	
 
        ``scm``
 
            alias of scm, i.e. *git* or *hg*
 

	
 
        ``repo``
 
            object from external api
 

	
 
        ``revisions``
 
            list of all available revisions' ids, in ascending order
 

	
 
        ``changesets``
 
            storage dict caching returned changesets
 

	
 
        ``path``
 
            absolute path to the repository
 

	
 
        ``branches``
 
            branches as list of changesets
 

	
 
        ``tags``
 
            tags as list of changesets
 
    """
 
    scm = None
 
    DEFAULT_BRANCH_NAME = None
 
    EMPTY_CHANGESET = '0' * 40
 

	
 
    def __init__(self, repo_path, create=False, **kwargs):
 
        """
 
        Initializes repository. Raises RepositoryError if repository could
 
        not be find at the given ``repo_path`` or directory at ``repo_path``
 
        exists and ``create`` is set to True.
 

	
 
        :param repo_path: local path of the repository
 
        :param create=False: if set to True, would try to create repository.
 
        :param src_url=None: if set, should be proper url from which repository
 
          would be cloned; requires ``create`` parameter to be set to True -
 
          raises RepositoryError if src_url is set and create evaluates to
 
          False
 
        """
 
        raise NotImplementedError
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, self.path)
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return self.count()
 

	
 
    def __eq__(self, other):
 
        same_instance = isinstance(other, self.__class__)
 
        return same_instance and getattr(other, 'path', None) == self.path
 

	
 
    def __ne__(self, other):
 
        return not self.__eq__(other)
 

	
 
    @LazyProperty
 
    def alias(self):
 
        for k, v in settings.BACKENDS.items():
 
            if v.split('.')[-1] == str(self.__class__.__name__):
 
                return k
 

	
 
    @LazyProperty
 
    def name(self):
 
        """
 
        Return repository name (without group name)
 
        """
 
        raise NotImplementedError
 

	
 
    @property
 
    def name_unicode(self):
 
        return safe_unicode(self.name)
 

	
 
    @LazyProperty
 
    def owner(self):
 
        raise NotImplementedError
 

	
 
    @LazyProperty
 
    def description(self):
 
        raise NotImplementedError
 

	
 
    @LazyProperty
 
    def size(self):
 
        """
 
        Returns combined size in bytes for all repository files
 
        """
 

	
 
        size = 0
 
        try:
 
            tip = self.get_changeset()
 
            for topnode, dirs, files in tip.walk('/'):
 
                for f in files:
 
                    size += tip.get_file_size(f.path)
 

	
 
        except RepositoryError as e:
 
            pass
 
        return size
 

	
 
    def is_valid(self):
 
        """
 
        Validates repository.
 
        """
 
        raise NotImplementedError
 

	
 
    def is_empty(self):
 
        return self._empty
 

	
 
    #==========================================================================
 
    # CHANGESETS
 
    #==========================================================================
 

	
 
    def get_changeset(self, revision=None):
 
        """
 
        Returns instance of ``Changeset`` class. If ``revision`` is None, most
 
        recent changeset is returned.
 

	
 
        :raises ``EmptyRepositoryError``: if there are no revisions
 
        """
 
        raise NotImplementedError
 

	
 
    def __iter__(self):
 
        """
 
        Allows Repository objects to be iterated.
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for revision in self.revisions:
 
            yield self.get_changeset(revision)
 

	
 
    def get_changesets(self, start=None, end=None, start_date=None,
 
                       end_date=None, branch_name=None, reverse=False):
 
        """
 
        Returns iterator of ``MercurialChangeset`` objects from start to end
 
        not inclusive This should behave just like a list, ie. end is not
 
        inclusive
 

	
 
        :param start: None or str
 
        :param end: None or str
 
        :param start_date:
 
        :param end_date:
 
        :param branch_name:
 
        :param reversed:
 
        """
 
        raise NotImplementedError
 

	
 
    def __getslice__(self, i, j):
 
        """
 
        Returns a iterator of sliced repository
 
        """
 
        for rev in self.revisions[i:j]:
 
            yield self.get_changeset(rev)
 

	
 
    def __getitem__(self, key):
 
        return self.get_changeset(key)
 

	
 
    def count(self):
 
        return len(self.revisions)
 

	
 
    def tag(self, name, user, revision=None, message=None, date=None, **opts):
 
        """
 
        Creates and returns a tag for the given ``revision``.
 

	
 
        :param name: name for new tag
kallithea/tests/functional/test_search_indexing.py
Show inline comments
 
@@ -32,167 +32,186 @@ def init_stopword_test(repo):
 
                                 author='this is it <this-is-it@foo.bar.com>',
 
                                 vcs_type='hg',
 
                                 parent=prev,
 
                                 newfile=True)
 

	
 
repos = [
 
    # reponame,              init func or fork base, groupname
 
    (u'indexing_test',       init_indexing_test,     None),
 
    (u'indexing_test-fork',  u'indexing_test',       None),
 
    (u'group/indexing_test', u'indexing_test',       u'group'),
 
    (u'this-is-it',          u'indexing_test',       None),
 
    (u'indexing_test-foo',   u'indexing_test',       None),
 
    (u'indexing_test-FOO',   u'indexing_test',       None),
 
    (u'stopword_test',       init_stopword_test,     None),
 
]
 

	
 
# map: name => id
 
repoids = {}
 
groupids = {}
 

	
 
def rebuild_index(full_index):
 
    with mock.patch('kallithea.lib.indexers.daemon.log.debug',
 
                    lambda *args, **kwargs: None):
 
        # The more revisions managed repositories have, the more
 
        # memory capturing "log.debug()" output in "indexers.daemon"
 
        # requires. This may cause unintentional failure of subsequent
 
        # tests, if ENOMEM at forking "git" prevents from rebuilding
 
        # index for search.
 
        # Therefore, "log.debug()" is disabled regardless of logging
 
        # level while rebuilding index.
 
        # (FYI, ENOMEM occurs at forking "git" with python 2.7.3,
 
        # Linux 3.2.78-1 x86_64, 3GB memory, and no ulimit
 
        # configuration for memory)
 
        create_test_index(TESTS_TMP_PATH, CONFIG, full_index=full_index)
 

	
 

	
 
class TestSearchControllerIndexing(TestController):
 
    @classmethod
 
    def setup_class(cls):
 
        for reponame, init_or_fork, groupname in repos:
 
            if groupname and groupname not in groupids:
 
                group = fixture.create_repo_group(groupname)
 
                groupids[groupname] = group.group_id
 
            if callable(init_or_fork):
 
                repo = fixture.create_repo(reponame,
 
                                           repo_group=groupname)
 
                init_or_fork(repo)
 
            else:
 
                repo = fixture.create_fork(init_or_fork, reponame,
 
                                           repo_group=groupname)
 
            repoids[reponame] = repo.repo_id
 

	
 
        # treat "it" as indexable filename
 
        filenames_mock = list(INDEX_FILENAMES)
 
        filenames_mock.append('it')
 
        with mock.patch('kallithea.lib.indexers.daemon.INDEX_FILENAMES',
 
                        filenames_mock):
 
            rebuild_index(full_index=False) # only for newly added repos
 

	
 
    @classmethod
 
    def teardown_class(cls):
 
        # delete in reversed order, to delete fork destination at first
 
        for reponame, init_or_fork, groupname in reversed(repos):
 
            RepoModel().delete(repoids[reponame])
 

	
 
        for reponame, init_or_fork, groupname in reversed(repos):
 
            if groupname in groupids:
 
                RepoGroupModel().delete(groupids.pop(groupname),
 
                                        force_delete=True)
 

	
 
        Session().commit()
 
        Session.remove()
 

	
 
        rebuild_index(full_index=True) # rebuild fully for subsequent tests
 

	
 
    @parametrize('reponame', [
 
        (u'indexing_test'),
 
        (u'indexing_test-fork'),
 
        (u'group/indexing_test'),
 
        (u'this-is-it'),
 
        (u'*-fork'),
 
        (u'group/*'),
 
    ])
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_tokenization(self, reponame, searchtype, query, hit):
 
        self.log_user()
 

	
 
        q = 'repository:%s %s' % (reponame, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('reponame', [
 
        (u'indexing_test'),
 
        (u'indexing_test-fork'),
 
        (u'group/indexing_test'),
 
        (u'this-is-it'),
 
    ])
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_searching_under_repository(self, reponame, searchtype, query, hit):
 
        self.log_user()
 

	
 
        response = self.app.get(url(controller='search', action='index',
 
                                    repo_name=reponame),
 
                                {'q': query, 'type': searchtype})
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'this_should_be_unique_content', 1),
 
        ('commit', 'this_should_be_unique_commit_log', 1),
 
        ('path', 'this_should_be_unique_filename.txt', 1),
 
    ])
 
    def test_repository_case_sensitivity(self, searchtype, query, hit):
 
        self.log_user()
 

	
 
        lname = u'indexing_test-foo'
 
        uname = u'indexing_test-FOO'
 

	
 
        # (1) "repository:REPONAME" condition should match against
 
        # repositories case-insensitively
 
        q = 'repository:%s %s' % (lname, query)
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': q, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % (hit * 2))
 

	
 
        # (2) on the other hand, searching under the specific
 
        # repository should return results only for that repository,
 
        # even if specified name matches against another repository
 
        # case-insensitively.
 
        response = self.app.get(url(controller='search', action='index',
 
                                    repo_name=uname),
 
                                {'q': query, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
 

	
 
        # confirm that there is no matching against lower name repository
 
        assert uname in response
 
        assert lname not in response
 

	
 
    @parametrize('searchtype,query,hit', [
 
        ('content', 'path:this/is/it def test', 1),
 
        ('commit', 'added:this/is/it bother to ask where', 1),
 
        # this condition matches against files below, because
 
        # "path:" condition is also applied on "repository path".
 
        # - "this/is/it" in "stopword_test" repo
 
        # - "this_should_be_unique_filename.txt" in "this-is-it" repo
 
        ('path', 'this/is/it', 2),
 

	
 
        ('content', 'extension:us', 1),
 
        ('path', 'extension:us', 1),
 
    ])
 
    def test_filename_stopword(self, searchtype, query, hit):
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': query, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
 

	
 
    @parametrize('searchtype,query,hit', [
 
        # matching against both 2 files
 
        ('content', 'owner:"this is it"', 0),
 
        ('content', 'owner:this-is-it', 0),
 
        ('path', 'owner:"this is it"', 0),
 
        ('path', 'owner:this-is-it', 0),
 

	
 
        # matching against both 2 revisions
 
        ('commit', 'owner:"this is it"', 0),
 
        ('commit', 'owner:"this-is-it"', 0),
 

	
 
        # matching against only 1 revision
 
        ('commit', 'author:"this is it"', 1),
 
        ('commit', 'author:"this-is-it"', 1),
 
    ])
 
    def test_mailaddr_stopword(self, searchtype, query, hit):
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': query, 'type': searchtype})
 

	
 
        response.mustcontain('>%d results' % hit)
0 comments (0 inline, 0 general)