Changeset - b5419cd0ac40
[Not reviewed]
default
0 1 0
Mads Kiilerich - 7 years ago 2018-12-23 21:16:07
mads@kiilerich.com
search: tweak logging from index-create

Try to make it slightly more readable.
1 file changed with 17 insertions and 17 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -92,30 +92,30 @@ class WhooshIndexingDaemon(object):
 
        self.filtered_repo_update_paths = {}
 
        if repo_update_list:
 
            self.filtered_repo_update_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_update_list:
 
                    self.filtered_repo_update_paths[repo_name] = repo
 
            self.repo_paths = self.filtered_repo_update_paths
 

	
 
        self.initial = True
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not '
 
                     'yet exist running full build')
 
                     'yet exist - running full build')
 
        elif not exists_in(self.index_location, IDX_NAME):
 
            log.info('Running full index build as the file content '
 
            log.info('Running full index build, as the file content '
 
                     'index does not exist')
 
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
 
            log.info('Running full index build as the changeset '
 
            log.info('Running full index build, as the changeset '
 
                     'index does not exist')
 
        else:
 
            self.initial = False
 

	
 
    def _get_index_revision(self, repo):
        """
        Return the revision that should be indexed for ``repo``.

        The database ``Repository`` entry is looked up by
        ``repo.name_unicode``. When an entry is found, the revision part of
        its ``landing_rev`` pair is returned; otherwise ``'tip'`` is used.
        """
        db_entry = Repository.get_by_repo_name(repo.name_unicode)
        if not db_entry:
            # no database entry for this repo - fall back to the default
            return 'tip'

        # landing_rev is a (type, revision) pair; only the revision is used
        _rev_type, revision = db_entry.landing_rev
        return revision
 

	
 
@@ -170,40 +170,39 @@ class WhooshIndexingDaemon(object):
 

	
 
    def get_node_mtime(self, node):
        """
        Return the date of the node's last changeset as a ``mktime`` value.

        :param node: object exposing ``last_changeset.date``, where ``date``
          provides a ``timetuple()`` method
        """
        last_change_date = node.last_changeset.date
        return mktime(last_change_date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name, index_rev=None):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 
        try:
 
            node = self.get_node(repo, path, index_rev)
 
        except (ChangesetError, NodeDoesNotExistError):
 
            log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
 
            log.debug("    >> %s - not found in %s %s", path, repo, index_rev)
 
            return 0, 0
 

	
 
        indexed = indexed_w_content = 0
 
        if self.is_indexable_node(node):
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                log.warning('    >> %s - no text content', path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]', path)
 
                log.debug('    >> %s', path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            log.debug('    >> %s - not indexable', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository_rawname=safe_unicode(repo_name),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
@@ -221,82 +220,82 @@ class WhooshIndexingDaemon(object):
 
        :param repo_name: name of the repository from whence the
 
          changeset originates including the repository group
 
        :param repo: the vcs repository instance to index changesets for,
 
          the presumption is the repo has changesets to index
 
        :param start_rev=None: the full sha id to start indexing from
 
          if start_rev is None then index from the first changeset in
 
          the repo
 
        """
 

	
 
        if start_rev is None:
 
            start_rev = repo[0].raw_id
 

	
 
        log.debug('indexing changesets in %s starting at rev: %s',
 
        log.debug('Indexing changesets in %s, starting at rev %s',
 
                  repo_name, start_rev)
 

	
 
        indexed = 0
 
        cs_iter = repo.get_changesets(start=start_rev)
 
        total = len(cs_iter)
 
        for cs in cs_iter:
 
            log.debug('    >> %s/%s', cs, total)
 
            indexed += 1
 
            log.debug('    >> %s %s/%s', cs, indexed, total)
 
            writer.add_document(
 
                raw_id=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                date=cs._timestamp,
 
                repository_rawname=safe_unicode(repo_name),
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                last=cs.last,
 
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
 
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
 
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        log.debug('indexed %d changesets for repo %s', indexed, repo_name)
 
        return indexed
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
 
        """
 
        Index files for given repo_name
 

	
 
        :param file_idx_writer: the whoosh index writer to add to
 
        :param repo_name: name of the repository we're indexing
 
        :param repo: instance of vcs repo
 
        """
 
        i_cnt = iwc_cnt = 0
 
        log.debug('building index for %s @revision:%s', repo.path,
 
        log.debug('Building file index for %s @revision:%s', repo_name,
 
                                                self._get_index_revision(repo))
 
        index_rev = self._get_index_revision(repo)
 
        for idx_path in self.get_paths(repo):
 
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name, index_rev)
 
            i_cnt += i
 
            iwc_cnt += iwc
 

	
 
        log.debug('added %s files %s with content for repo %s',
 
                  i_cnt + iwc_cnt, iwc_cnt, repo.path)
 
        return i_cnt, iwc_cnt
 

	
 
    def update_changeset_index(self):
 
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
 

	
 
        with idx.searcher() as searcher:
 
            writer = idx.writer()
 
            writer_is_dirty = False
 
            try:
 
                indexed_total = 0
 
                repo_name = None
 
                for repo_name, repo in self.repo_paths.items():
 
                for repo_name, repo in sorted(self.repo_paths.items()):
 
                    log.debug('Updating changeset index for repo %s', repo_name)
 
                    # skip indexing if there aren't any revs in the repo
 
                    num_of_revs = len(repo)
 
                    if num_of_revs < 1:
 
                        continue
 

	
 
                    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
 
                    q = qp.parse(u"last:t AND %s" % repo_name)
 

	
 
                    results = searcher.search(q)
 

	
 
                    # default to scanning the entire repo
 
                    last_rev = 0
 
@@ -376,39 +375,39 @@ class WhooshIndexingDaemon(object):
 
                            to_index.add(indexed_path)
 
                    except (ChangesetError, NodeDoesNotExistError):
 
                        # This file was deleted since it was indexed
 
                        log.debug('removing from index %s', indexed_path)
 
                        writer.delete_by_term('path', indexed_path)
 
                        writer_is_dirty = True
 

	
 
            # Loop over the files in the filesystem
 
            # Assume we have a function that gathers the filenames of the
 
            # documents to be indexed
 
            ri_cnt_total = 0  # indexed
 
            riwc_cnt_total = 0  # indexed with content
 
            for repo_name, repo in self.repo_paths.items():
 
            for repo_name, repo in sorted(self.repo_paths.items()):
 
                log.debug('Updating file index for repo %s', repo_name)
 
                # skip indexing if there aren't any revisions
 
                if len(repo) < 1:
 
                    continue
 
                ri_cnt = 0   # indexed
 
                riwc_cnt = 0  # indexed with content
 
                for path in self.get_paths(repo):
 
                    path = safe_unicode(path)
 
                    if path in to_index or path not in indexed_paths:
 

	
 
                        # This is either a file that's changed, or a new file
 
                        # that wasn't indexed before. So index it!
 
                        i, iwc = self.add_doc(writer, path, repo, repo_name)
 
                        writer_is_dirty = True
 
                        log.debug('re indexing %s', path)
 
                        ri_cnt += i
 
                        ri_cnt_total += 1
 
                        riwc_cnt += iwc
 
                        riwc_cnt_total += iwc
 
                log.debug('added %s files %s with content for repo %s',
 
                             ri_cnt + riwc_cnt, riwc_cnt, repo.path
 
                )
 
            log.debug('indexed %s files in total and %s with content',
 
                        ri_cnt_total, riwc_cnt_total
 
            )
 
        finally:
 
            if writer_is_dirty:
 
@@ -427,25 +426,26 @@ class WhooshIndexingDaemon(object):
 
        if not os.path.exists(self.index_location):
 
            os.mkdir(self.index_location)
 

	
 
        chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
 
                               indexname=CHGSET_IDX_NAME)
 
        chgset_idx_writer = chgset_idx.writer()
 

	
 
        file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
 
        file_idx_writer = file_idx.writer()
 
        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
 
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 

	
 
        for repo_name, repo in self.repo_paths.items():
 
        for repo_name, repo in sorted(self.repo_paths.items()):
 
            log.debug('Updating indices for repo %s', repo_name)
 
            # skip indexing if there aren't any revisions
 
            if len(repo) < 1:
 
                continue
 

	
 
            self.index_files(file_idx_writer, repo_name, repo)
 
            self.index_changesets(chgset_idx_writer, repo_name, repo)
 

	
 
        log.debug('>> COMMITING CHANGES <<')
 
        file_idx_writer.commit(merge=True)
 
        chgset_idx_writer.commit(merge=True)
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 

	
0 comments (0 inline, 0 general)