Changeset - 20699dd652ff
[Not reviewed]
stable
0 1 0
Mads Kiilerich - 10 years ago 2015-12-25 12:32:47
madski@unity3d.com
indexer: skip documents that can't be retrieved - probably because encoding issues (Issue #175)

This doesn't fix the encoding issue but it makes it less fatal.
1 file changed with 5 insertions and 1 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -77,194 +77,198 @@ class WhooshIndexingDaemon(object):
 
            raise Exception('You have to provide repositories location')
 

	
 
        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
 

	
 
        #filter repo list
 
        if repo_list:
 
            #Fix non-ascii repo names to unicode
 
            repo_list = map(safe_unicode, repo_list)
 
            self.filtered_repo_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_list:
 
                    self.filtered_repo_paths[repo_name] = repo
 

	
 
            self.repo_paths = self.filtered_repo_paths
 

	
 
        #filter update repo list
 
        self.filtered_repo_update_paths = {}
 
        if repo_update_list:
 
            self.filtered_repo_update_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_update_list:
 
                    self.filtered_repo_update_paths[repo_name] = repo
 
            self.repo_paths = self.filtered_repo_update_paths
 

	
 
        self.initial = True
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not '
 
                     'yet exist running full build')
 
        elif not exists_in(self.index_location, IDX_NAME):
 
            log.info('Running full index build as the file content '
 
                     'index does not exist')
 
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
 
            log.info('Running full index build as the changeset '
 
                     'index does not exist')
 
        else:
 
            self.initial = False
 

	
 
    def _get_index_revision(self, repo):
 
        db_repo = Repository.get_by_repo_name(repo.name_unicode)
 
        landing_rev = 'tip'
 
        if db_repo:
 
            _rev_type, _rev = db_repo.landing_rev
 
            landing_rev = _rev
 
        return landing_rev
 

	
 
    def _get_index_changeset(self, repo, index_rev=None):
 
        if not index_rev:
 
            index_rev = self._get_index_revision(repo)
 
        cs = repo.get_changeset(index_rev)
 
        return cs
 

	
 
    def get_paths(self, repo):
 
        """
 
        recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        index_paths_ = set()
 
        try:
 
            cs = self._get_index_changeset(repo)
 
            for _topnode, _dirs, files in cs.walk('/'):
 
                for f in files:
 
                    index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
 

	
 
        except RepositoryError:
 
            log.debug(traceback.format_exc())
 
            pass
 
        return index_paths_
 

	
 
    def get_node(self, repo, path, index_rev=None):
 
        """
 
        gets a filenode based on given full path. It operates on string for
 
        hg git compatibility.
 

	
 
        :param repo: scm repo instance
 
        :param path: full path including root location
 
        :return: FileNode
 
        """
 
        # FIXME: paths should be normalized ... or even better: don't include repo.path
 
        path = safe_str(path)
 
        repo_path = safe_str(repo.path)
 
        assert path.startswith(repo_path)
 
        assert path[len(repo_path)] in (os.path.sep, os.path.altsep)
 
        node_path = path[len(repo_path) + 1:]
 
        cs = self._get_index_changeset(repo, index_rev=index_rev)
 
        node = cs.get_node(node_path)
 
        return node
 

	
 
    def get_node_mtime(self, node):
 
        return mktime(node.last_changeset.date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name, index_rev=None):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 
        try:
 
            node = self.get_node(repo, path, index_rev)
 
        except (ChangesetError, NodeDoesNotExistError):
 
            log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
 
            return 0, 0
 

	
 
        node = self.get_node(repo, path, index_rev)
 
        indexed = indexed_w_content = 0
 
        # we just index the content of chosen files, and skip binary files
 
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]', path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def index_changesets(self, writer, repo_name, repo, start_rev=None):
 
        """
 
        Add all changeset in the vcs repo starting at start_rev
 
        to the index writer
 

	
 
        :param writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository from whence the
 
          changeset originates including the repository group
 
        :param repo: the vcs repository instance to index changesets for,
 
          the presumption is the repo has changesets to index
 
        :param start_rev=None: the full sha id to start indexing from
 
          if start_rev is None then index from the first changeset in
 
          the repo
 
        """
 

	
 
        if start_rev is None:
 
            start_rev = repo[0].raw_id
 

	
 
        log.debug('indexing changesets in %s starting at rev: %s',
 
                  repo_name, start_rev)
 

	
 
        indexed = 0
 
        cs_iter = repo.get_changesets(start=start_rev)
 
        total = len(cs_iter)
 
        for cs in cs_iter:
 
            log.debug('    >> %s/%s', cs, total)
 
            writer.add_document(
 
                raw_id=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                date=cs._timestamp,
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                last=cs.last,
 
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
 
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
 
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        log.debug('indexed %d changesets for repo %s', indexed, repo_name)
 
        return indexed
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
 
        """
 
        Index files for given repo_name
 

	
 
        :param file_idx_writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository we're indexing
 
        :param repo: instance of vcs repo
 
        """
 
        i_cnt = iwc_cnt = 0
 
        log.debug('building index for %s @revision:%s', repo.path,
 
                                                self._get_index_revision(repo))
 
        index_rev = self._get_index_revision(repo)
 
        for idx_path in self.get_paths(repo):
 
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name, index_rev)
 
            i_cnt += i
 
            iwc_cnt += iwc
 

	
 
        log.debug('added %s files %s with content for repo %s',
 
                  i_cnt + iwc_cnt, iwc_cnt, repo.path)
 
        return i_cnt, iwc_cnt
 

	
 
    def update_changeset_index(self):
0 comments (0 inline, 0 general)