kallithea Changeset - 20699dd652ff

Changeset - 20699dd652ff

Parent rev.

Child rev.

[Not reviewed]

stable

0 1 0

Mads Kiilerich - 10 years ago 2015-12-25 12:32:47
madski@unity3d.com

indexer: skip documents that can't be retrieved - probably because of encoding issues (Issue #175)

This doesn't fix the encoding issue but it makes it less fatal.

1 file changed with 5 insertions and 1 deletions:

kallithea/lib/indexers/daemon.py

0 comments (0 inline, 0 general)

kallithea/lib/indexers/daemon.py

➞

Show inline comments

@@ @@ -125,98 +125,102 @@ class WhooshIndexingDaemon(object): @@
             index_rev = self._get_index_revision(repo)
         cs = repo.get_changeset(index_rev)
         return cs
     def get_paths(self, repo):
         """
         Collect the full paths of all files in the changeset being indexed.

         The index changeset of ``repo`` is walked from '/', and each file
         path is joined onto ``repo.path`` (both passed through safe_str).

         If a RepositoryError is raised, the traceback is logged at debug
         level and whatever was collected up to that point is returned.

         :param repo: scm repo instance
         :return: set of full paths
         """
         found = set()
         try:
             changeset = self._get_index_changeset(repo)
             for _top, _subdirs, file_nodes in changeset.walk('/'):
                 for file_node in file_nodes:
                     full_path = jn(safe_str(repo.path), safe_str(file_node.path))
                     found.add(full_path)
         except RepositoryError:
             # best effort: keep the paths gathered so far
             log.debug(traceback.format_exc())
         return found
     def get_node(self, repo, path, index_rev=None):
         """
         Look up the node for a full path in the index changeset.

         Works on strings (via safe_str) for hg git compatibility. The given
         path must start with ``repo.path`` followed by a path separator;
         the remainder is the path looked up inside the changeset.

         :param repo: scm repo instance
         :param path: full path including root location
         :param index_rev: passed on to _get_index_changeset
         :return: FileNode
         """
         # FIXME: paths should be normalized ... or even better: don't include repo.path
         full_path = safe_str(path)
         root = safe_str(repo.path)
         root_len = len(root)
         assert full_path.startswith(root)
         # the character right after the root must be a separator
         assert full_path[root_len] in (os.path.sep, os.path.altsep)
         relative_path = full_path[root_len + 1:]
         changeset = self._get_index_changeset(repo, index_rev=index_rev)
         return changeset.get_node(relative_path)
     def get_node_mtime(self, node):
         """
         Return the date of the node's last changeset as a mktime() timestamp.

         :param node: object exposing ``last_changeset.date``
         :return: float produced by mktime() from the date's timetuple
         """
         last_change_date = node.last_changeset.date
         return mktime(last_change_date.timetuple())
     def add_doc(self, writer, path, repo, repo_name, index_rev=None):
         """
         Add a document for ``path`` to the index writer. The node data is
         fetched from the vcs backend by this function itself.

         If the node can't be retrieved (ChangesetError or
         NodeDoesNotExistError) the document is skipped and ``(0, 0)`` is
         returned.

         :param writer: index writer receiving the document
         :param path: full path including root location
         :param repo: scm repo instance
         :param repo_name: name of the repository, stored with the document
         :param index_rev: passed on to get_node
         :return: tuple ``(indexed, indexed_w_content)`` - each 0 or 1
         """
         # fetch the node exactly once, and only inside the guard, so that a
         # document that can't be retrieved is skipped instead of being fatal
         try:
             node = self.get_node(repo, path, index_rev)
         except (ChangesetError, NodeDoesNotExistError):
             log.debug("couldn't add doc - %s did not have %r at %s", repo, path, index_rev)
             return 0, 0

         indexed = indexed_w_content = 0
         # we just index the content of chosen files, and skip binary files
         if node.extension in INDEX_EXTENSIONS and not node.is_binary:
             u_content = node.content
             if not isinstance(u_content, unicode):
                 # neither counter is incremented on this path
                 log.warning('  >> %s Could not get this content as unicode '
                             'replacing with empty content', path)
                 u_content = u''
             else:
                 log.debug('    >> %s [WITH CONTENT]', path)
                 indexed_w_content += 1
         else:
             log.debug('    >> %s', path)
             # just index file name without its content
             u_content = u''
             indexed += 1

         p = safe_unicode(path)
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
             modtime=self.get_node_mtime(node),
             extension=node.extension
         )
         return indexed, indexed_w_content
     def index_changesets(self, writer, repo_name, repo, start_rev=None):
         """
         Add all changeset in the vcs repo starting at start_rev
         to the index writer
         :param writer: the whoosh index writer to add to
         :param repo_name: name of the repository from whence the
           changeset originates including the repository group
         :param repo: the vcs repository instance to index changesets for,
           the presumption is the repo has changesets to index
         :param start_rev=None: the full sha id to start indexing from
           if start_rev is None then index from the first changeset in
           the repo
         """
         if start_rev is None:
             start_rev = repo[0].raw_id

0 comments (0 inline, 0 general)