kallithea Changeset - 29ec9ddbe258

Changeset - 29ec9ddbe258

Parent rev.

Child rev.

[Not reviewed]

default

0 1 0

Marcin Kuzminski - 15 years ago 2010-10-07 18:30:50
marcin@python-works.com

fixed whoosh indexing possible unicode decode errors

1 file changed with 2 insertions and 2 deletions:

rhodecode/lib/indexers/daemon.py

0 comments (0 inline, 0 general)

rhodecode/lib/indexers/daemon.py

➞

Show inline comments

@@ @@ -61,98 +61,98 @@ log.addHandler(ch) @@
 def scan_paths(root_location):
     """Scan ``root_location`` for repositories via ``HgModel.repo_scan``.

     The scan result is handed back unchanged. ``build_index`` iterates over
     its ``.values()``, so it is presumably a mapping of repository objects
     (TODO confirm against ``HgModel.repo_scan``).
     """
     # The positional arguments are forwarded exactly as before; their meaning
     # is defined by HgModel.repo_scan, which is not visible from this module.
     scan_args = ('/', root_location, None, True)
     return HgModel.repo_scan(*scan_args)
 class WhooshIndexingDaemon(object):
     """Daemon for atomic jobs"""
     def __init__(self, indexname='HG_INDEX', repo_location=None):
         """Store the daemon settings and make sure the index directory exists.

         :param indexname: name of the index this daemon works on
         :param repo_location: root location that is scanned for repositories

         ``self.initial`` ends up ``True`` only when ``IDX_LOCATION`` had to be
         created here, i.e. when no index directory existed before.
         """
         self.indexname = indexname
         self.repo_location = repo_location
         self.initial = False

         if os.path.isdir(IDX_LOCATION):
             # index directory is already in place - nothing left to prepare
             return

         # first run: create the directory and flag that a full build is due
         os.mkdir(IDX_LOCATION)
         log.info('Cannot run incremental index since it does not'
                  ' yet exist running full build')
         self.initial = True
     def get_paths(self, root_dir):
         """Walk ``root_dir`` recursively and collect every file path in it,
         skipping everything stored below a ``.hg`` directory.

         :param root_dir: directory to walk
         :returns: set of paths built as ``jn(dirpath, filename)``
         """
         index_paths_ = set()
         for path, dirs, files in os.walk(root_dir):
             # Prune mercurial metadata directories by exact name. Editing
             # ``dirs`` in place stops os.walk from descending into them, and
             # unlike a substring test on the whole path it does not drop
             # files just because some path component contains '.hg'.
             dirs[:] = [d for d in dirs if d != '.hg']
             index_paths_.update(jn(path, f) for f in files)
         return index_paths_
     def add_doc(self, writer, path, repo):
         """Adding doc to writer"""
         ext = unicode(path.split('/')[-1].split('.')[-1].lower())
         #we just index the content of choosen files
         if ext in INDEX_EXTENSIONS:
             log.debug('    >> %s [WITH CONTENT]' % path)
             fobj = open(path, 'rb')
             content = fobj.read()
             fobj.close()
             u_content = safe_unicode(content)
         else:
             log.debug('    >> %s' % path)
             #just index file name without it's content
             u_content = u''
         try:
             os.stat(path)
             writer.add_document(owner=unicode(repo.contact),
                             repository=u"%s" % repo.name,
                             path=u"%s" % path,
                             repository=safe_unicode(repo.name),
                             path=safe_unicode(path),
                             content=u_content,
                             modtime=os.path.getmtime(path),
                             extension=ext)
         except OSError, e:
             import errno
             if e.errno == errno.ENOENT:
                 log.debug('path %s does not exist or is a broken symlink' % path)
             else:
                 raise e
     def build_index(self):
         """Build the index from scratch.

         Any existing ``IDX_LOCATION`` directory is removed and recreated, a
         new index is created in it, every file returned by ``get_paths`` for
         every repository found by ``scan_paths`` is added through
         ``add_doc``, and the writer is committed at the end.
         """
         index_dir_present = os.path.exists(IDX_LOCATION)
         if index_dir_present:
             log.debug('removing previos index')
             rmtree(IDX_LOCATION)

         if not os.path.exists(IDX_LOCATION):
             os.mkdir(IDX_LOCATION)

         index = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
         index_writer = index.writer()

         # repositories are scanned only after the writer has been opened
         repositories = scan_paths(self.repo_location).values()
         for repo in repositories:
             log.debug('building index @ %s' % repo.path)
             for file_path in self.get_paths(repo.path):
                 self.add_doc(index_writer, file_path, repo)

         index_writer.commit(merge=True)
         log.debug('>>> FINISHED BUILDING INDEX <<<')
     def update_index(self):
         log.debug('STARTING INCREMENTAL INDEXING UPDATE')
         idx = open_dir(IDX_LOCATION, indexname=self.indexname)
         # The set of all paths in the index
         indexed_paths = set()
         # The set of all paths we need to re-index
         to_index = set()
         reader = idx.reader()
         writer = idx.writer()
         # Loop over the stored fields in the index
         for fields in reader.all_stored_fields():
             indexed_path = fields['path']
             indexed_paths.add(indexed_path)

0 comments (0 inline, 0 general)