Changeset - e01a85f9fc90
[Not reviewed]
default
0 1 0
Marcin Kuzminski - 15 years ago 2010-09-08 01:33:38
marcin@python-works.com
fixed initial whoosh indexer. Build full index on first run even with incremental flag
1 file changed with 7 insertions and 1 deletions:
0 comments (0 inline, 0 general)
pylons_app/lib/indexers/daemon.py
Show inline comments
 
@@ -11,96 +11,102 @@
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
# 
 
# You should have received a copy of the GNU General Public License
 
# along with this program; if not, write to the Free Software
 
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 
# MA  02110-1301, USA.
 
"""
 
Created on Jan 26, 2010
 

	
 
@author: marcink
 
A deamon will read from task table and run tasks
 
"""
 
import sys
 
import os
 
from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
#to get the pylons_app import
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 
from pidlock import LockHeld, DaemonLock
 
import traceback
 
from pylons_app.config.environment import load_environment
 
from pylons_app.model.hg_model import HgModel
 
from pylons_app.lib.helpers import safe_unicode
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 
from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
 
SCHEMA, IDX_NAME
 

	
 
import logging
 
import logging.config
 
logging.config.fileConfig(jn(project_path, 'development.ini'))
 
log = logging.getLogger('whooshIndexer')
 

	
 
def scan_paths(root_location):
 
    return HgModel.repo_scan('/', root_location, None, True)
 

	
 
class WhooshIndexingDaemon(object):
 
    """Deamon for atomic jobs"""
 

	
 
    def __init__(self, indexname='HG_INDEX', repo_location=None):
 
        self.indexname = indexname
 
        self.repo_location = repo_location
 
        self.initial = False
 
        if not os.path.isdir(IDX_LOCATION):
 
            os.mkdir(IDX_LOCATION)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 
    
 
    def get_paths(self, root_dir):
 
        """recursive walk in root dir and return a set of all path in that dir
 
        excluding files in .hg dir"""
 
        index_paths_ = set()
 
        for path, dirs, files in os.walk(root_dir):
 
            if path.find('.hg') == -1:
 
                for f in files:
 
                    index_paths_.add(jn(path, f))
 
    
 
        return index_paths_
 
    
 
    def add_doc(self, writer, path, repo):
 
        """Adding doc to writer"""
 
        
 
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
 
        #we just index the content of choosen files
 
        if ext in INDEX_EXTENSIONS:
 
            log.debug('    >> %s [WITH CONTENT]' % path)
 
            fobj = open(path, 'rb')
 
            content = fobj.read()
 
            fobj.close()
 
            u_content = safe_unicode(content)
 
        else:
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 
        
 
        
 
        
 
        try:
 
            os.stat(path)
 
            writer.add_document(owner=unicode(repo.contact),
 
                            repository=u"%s" % repo.name,
 
                            path=u"%s" % path,
 
                            content=u_content,
 
                            modtime=os.path.getmtime(path),
 
                            extension=ext)             
 
        except OSError, e:
 
            import errno
 
            if e.errno == errno.ENOENT:
 
                log.debug('path %s does not exist or is a broken symlink' % path)
 
            else:
 
                raise e                 
 

	
 
    
 
    def build_index(self):
 
        if os.path.exists(IDX_LOCATION):
 
@@ -133,88 +139,88 @@ class WhooshIndexingDaemon(object):
 
        to_index = set()
 
        
 
        reader = idx.reader()
 
        writer = idx.writer()
 
    
 
        # Loop over the stored fields in the index
 
        for fields in reader.all_stored_fields():
 
            indexed_path = fields['path']
 
            indexed_paths.add(indexed_path)
 
    
 
            if not os.path.exists(indexed_path):
 
                # This file was deleted since it was indexed
 
                log.debug('removing from index %s' % indexed_path)
 
                writer.delete_by_term('path', indexed_path)
 
    
 
            else:
 
                # Check if this file was changed since it
 
                # was indexed
 
                indexed_time = fields['modtime']
 
                
 
                mtime = os.path.getmtime(indexed_path)
 
    
 
                if mtime > indexed_time:
 
    
 
                    # The file has changed, delete it and add it to the list of
 
                    # files to reindex
 
                    log.debug('adding to reindex list %s' % indexed_path)
 
                    writer.delete_by_term('path', indexed_path)
 
                    to_index.add(indexed_path)
 
                    #writer.commit()
 
    
 
        # Loop over the files in the filesystem
 
        # Assume we have a function that gathers the filenames of the
 
        # documents to be indexed
 
        for repo in scan_paths(self.repo_location).values():
 
            for path in self.get_paths(repo.path):
 
                if path in to_index or path not in indexed_paths:
 
                    # This is either a file that's changed, or a new file
 
                    # that wasn't indexed before. So index it!
 
                    self.add_doc(writer, path, repo)
 
                    log.debug('reindexing %s' % path)
 
    
 
        writer.commit(merge=True)
 
        #idx.optimize()
 
        log.debug('>>> FINISHED <<<')
 
        
 
    def run(self, full_index=False):
 
        """Run daemon"""
 
        if full_index:
 
        if full_index or self.initial:
 
            self.build_index()
 
        else:
 
            self.update_index()
 
        
 
if __name__ == "__main__":
 
    arg = sys.argv[1:]
 
    if len(arg) != 2:
 
        sys.stderr.write('Please specify indexing type [full|incremental]' 
 
                         'and path to repositories as script args \n')
 
        sys.exit()
 
    
 
    
 
    if arg[0] == 'full':
 
        full_index = True
 
    elif arg[0] == 'incremental':
 
        # False means looking just for changes
 
        full_index = False
 
    else:
 
        sys.stdout.write('Please use [full|incremental]' 
 
                         ' as script first arg \n')
 
        sys.exit()
 
    
 
    if not os.path.isdir(arg[1]):
 
        sys.stderr.write('%s is not a valid path \n' % arg[1])
 
        sys.exit()
 
    else:
 
        if arg[1].endswith('/'):
 
            repo_location = arg[1] + '*'
 
        else:
 
            repo_location = arg[1] + '/*'
 
    
 
    try:
 
        l = DaemonLock()
 
        WhooshIndexingDaemon(repo_location=repo_location)\
 
            .run(full_index=full_index)
 
        l.release()
 
    except LockHeld:
 
        sys.exit(1)
 

	
0 comments (0 inline, 0 general)