Changeset - 3072935bdeed
[Not reviewed]
default
1 2 0
Marcin Kuzminski - 15 years ago 2010-10-09 00:22:19
marcin@python-works.com
rewrote whoosh indexing to run internal repository.walk() instead of filesystem.
Disabled default hg update hook (not needed since whoosh is not dependent on file system files to index)
3 files changed with 35 insertions and 211 deletions:
0 comments (0 inline, 0 general)
rhodecode/lib/db_manage.py
Show inline comments
 
@@ -22,192 +22,193 @@
 
Created on April 10, 2010
 
database managment and creation for hg app
 
@author: marcink
 
"""
 

	
 
from os.path import dirname as dn, join as jn
 
import os
 
import sys
 
import uuid
 

	
 
from rhodecode.lib.auth import get_crypt_password
 
from rhodecode.lib.utils import ask_ok
 
from rhodecode.model import init_model
 
from rhodecode.model.db import User, Permission, RhodeCodeUi, RhodeCodeSettings, \
 
    UserToPerm
 
from rhodecode.model import meta
 
from sqlalchemy.engine import create_engine
 
import logging
 

	
 
log = logging.getLogger(__name__)
 

	
 
class DbManage(object):
 
    def __init__(self, log_sql, dbname, root, tests=False):
 
        self.dbname = dbname
 
        self.tests = tests
 
        self.root = root
 
        dburi = 'sqlite:////%s' % jn(self.root, self.dbname)
 
        engine = create_engine(dburi, echo=log_sql) 
 
        init_model(engine)
 
        self.sa = meta.Session
 
        self.db_exists = False
 
    
 
    def check_for_db(self, override):
 
        db_path = jn(self.root, self.dbname)
 
        log.info('checking for existing db in %s', db_path)
 
        if os.path.isfile(db_path):
 
            self.db_exists = True
 
            log.info('database exist')
 
            if not override:
 
                raise Exception('database already exists')
 

	
 
    def create_tables(self, override=False):
 
        """
 
        Create a auth database
 
        """
 
        self.check_for_db(override)
 
        if override:
 
            log.info("database exist and it's going to be destroyed")
 
            if self.tests:
 
                destroy = True
 
            else:
 
                destroy = ask_ok('Are you sure to destroy old database ? [y/n]')
 
            if not destroy:
 
                sys.exit()
 
            if self.db_exists and destroy:
 
                os.remove(jn(self.root, self.dbname))
 
        checkfirst = not override
 
        meta.Base.metadata.create_all(checkfirst=checkfirst)
 
        log.info('Created tables for %s', self.dbname)
 
    
 
    def admin_prompt(self):
 
        if not self.tests:
 
            import getpass
 
            username = raw_input('Specify admin username:')
 
            password = getpass.getpass('Specify admin password:')
 
            confirm = getpass.getpass('Confirm password:')
 
            if password != confirm:
 
                log.error('passwords mismatch')
 
                sys.exit()
 
            email = raw_input('Specify admin email:')
 
            self.create_user(username, password, email, True)
 
        else:
 
            log.info('creating admin and regular test users')
 
            self.create_user('test_admin', 'test12', 'test_admin@mail.com', True)
 
            self.create_user('test_regular', 'test12', 'test_regular@mail.com', False)
 
            self.create_user('test_regular2', 'test12', 'test_regular2@mail.com', False)
 
            
 
        
 
    
 
    def config_prompt(self, test_repo_path=''):
 
        log.info('Setting up repositories config')
 
        
 
        if not self.tests and not test_repo_path:
 
            path = raw_input('Specify valid full path to your repositories'
 
                        ' you can change this later in application settings:')
 
        else:
 
            path = test_repo_path
 
            
 
        if not os.path.isdir(path):
 
            log.error('You entered wrong path: %s', path)
 
            sys.exit()
 
        
 
        hooks1 = RhodeCodeUi()
 
        hooks1.ui_section = 'hooks'
 
        hooks1.ui_key = 'changegroup.update'
 
        hooks1.ui_value = 'hg update >&2'
 
        hooks1.ui_active = False
 
        
 
        hooks2 = RhodeCodeUi()
 
        hooks2.ui_section = 'hooks'
 
        hooks2.ui_key = 'changegroup.repo_size'
 
        hooks2.ui_value = 'python:rhodecode.lib.hooks.repo_size' 
 
                
 
        web1 = RhodeCodeUi()
 
        web1.ui_section = 'web'
 
        web1.ui_key = 'push_ssl'
 
        web1.ui_value = 'false'
 
                
 
        web2 = RhodeCodeUi()
 
        web2.ui_section = 'web'
 
        web2.ui_key = 'allow_archive'
 
        web2.ui_value = 'gz zip bz2'
 
                
 
        web3 = RhodeCodeUi()
 
        web3.ui_section = 'web'
 
        web3.ui_key = 'allow_push'
 
        web3.ui_value = '*'
 
        
 
        web4 = RhodeCodeUi()
 
        web4.ui_section = 'web'
 
        web4.ui_key = 'baseurl'
 
        web4.ui_value = '/'                        
 
        
 
        paths = RhodeCodeUi()
 
        paths.ui_section = 'paths'
 
        paths.ui_key = '/'
 
        paths.ui_value = os.path.join(path, '*')
 
        
 
        
 
        hgsettings1 = RhodeCodeSettings()
 
        
 
        hgsettings1.app_settings_name = 'realm'
 
        hgsettings1.app_settings_value = 'RhodeCode authentication'
 
        
 
        hgsettings2 = RhodeCodeSettings()
 
        hgsettings2.app_settings_name = 'title'
 
        hgsettings2.app_settings_value = 'RhodeCode'      
 
        
 
        try:
 
            self.sa.add(hooks1)
 
            self.sa.add(hooks2)
 
            self.sa.add(web1)
 
            self.sa.add(web2)
 
            self.sa.add(web3)
 
            self.sa.add(web4)
 
            self.sa.add(paths)
 
            self.sa.add(hgsettings1)
 
            self.sa.add(hgsettings2)
 
            self.sa.commit()
 
        except:
 
            self.sa.rollback()
 
            raise        
 
        log.info('created ui config')
 
                    
 
    def create_user(self, username, password, email='', admin=False):
 
        log.info('creating administrator user %s', username)
 
        new_user = User()
 
        new_user.username = username
 
        new_user.password = get_crypt_password(password)
 
        new_user.name = 'RhodeCode'
 
        new_user.lastname = 'Admin'
 
        new_user.email = email
 
        new_user.admin = admin
 
        new_user.active = True
 
        
 
        try:
 
            self.sa.add(new_user)
 
            self.sa.commit()
 
        except:
 
            self.sa.rollback()
 
            raise
 

	
 
    def create_default_user(self):
 
        log.info('creating default user')
 
        #create default user for handling default permissions.
 
        def_user = User()
 
        def_user.username = 'default'
 
        def_user.password = get_crypt_password(str(uuid.uuid1())[:8])
 
        def_user.name = 'default'
 
        def_user.lastname = 'default'
 
        def_user.email = 'default@default.com'
 
        def_user.admin = False
 
        def_user.active = False
 
        try:
 
            self.sa.add(def_user)
 
            self.sa.commit()
 
        except:
 
            self.sa.rollback()
 
            raise
 
    
 
    def create_permissions(self):
 
        #module.(access|create|change|delete)_[name]
 
        #module.(read|write|owner)
rhodecode/lib/indexers/daemon.py
Show inline comments
 
#!/usr/bin/env python
 
# encoding: utf-8
 
# whoosh indexer daemon for rhodecode
 
# Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
 
#
 
# This program is free software; you can redistribute it and/or
 
# modify it under the terms of the GNU General Public License
 
# as published by the Free Software Foundation; version 2
 
# of the License or (at your opinion) any later version of the license.
 
# 
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
# 
 
# You should have received a copy of the GNU General Public License
 
# along with this program; if not, write to the Free Software
 
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 
# MA  02110-1301, USA.
 
"""
 
Created on Jan 26, 2010
 

	
 
@author: marcink
 
A deamon will read from task table and run tasks
 
"""
 
import sys
 
import os
 
from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
#to get the rhodecode import
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 
from rhodecode.lib.pidlock import LockHeld, DaemonLock
 
from rhodecode.model.hg_model import HgModel
 
from rhodecode.lib.helpers import safe_unicode
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 
from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
 

	
 
from time import mktime
 
from vcs.backends import hg
 

	
 
import logging
 

	
 
log = logging.getLogger('whooshIndexer')
 
# create logger
 
log.setLevel(logging.DEBUG)
 
log.propagate = False
 
# create console handler and set level to debug
 
ch = logging.StreamHandler()
 
ch.setLevel(logging.DEBUG)
 

	
 
# create formatter
 
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 

	
 
# add formatter to ch
 
ch.setFormatter(formatter)
 

	
 
# add ch to logger
 
log.addHandler(ch)
 

	
 
def scan_paths(root_location):
 
    return HgModel.repo_scan('/', root_location, None, True)
 

	
 
class WhooshIndexingDaemon(object):
 
    """Deamon for atomic jobs"""
 
    """
 
    Deamon for atomic jobs
 
    """
 

	
 
    def __init__(self, indexname='HG_INDEX', repo_location=None):
 
        self.indexname = indexname
 
        self.repo_location = repo_location
 
        self.initial = False
 
        if not os.path.isdir(IDX_LOCATION):
 
            os.mkdir(IDX_LOCATION)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 
    
 
        
 
    def get_paths(self, root_dir):
 
        """recursive walk in root dir and return a set of all path in that dir
 
        excluding files in .hg dir"""
 
        """
 
        recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        repo = hg.MercurialRepository(root_dir)
 
        index_paths_ = set()
 
        for path, dirs, files in os.walk(root_dir):
 
            if path.find('.hg') == -1:
 
        for topnode, dirs, files in repo.walk('/', 'tip'):
 
            for f in files:
 
                index_paths_.add(jn(root_dir, f.path))
 
            for dir in dirs:
 
                for f in files:
 
                    index_paths_.add(jn(path, f))
 
    
 
        return index_paths_
 
    
 
                    index_paths_.add(jn(root_dir, f.path))
 
            
 
        return index_paths_        
 

	
 

	
 
    def add_doc(self, writer, path, repo):
 
        """Adding doc to writer"""
 
        
 
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
 
        #we just index the content of choosen files
 
        if ext in INDEX_EXTENSIONS:
 
        n_path = path[len(repo.path) + 1:]
 
        node = repo.get_changeset().get_node(n_path)
 

	
 
        #we just index the content of chosen files
 
        if node.extension in INDEX_EXTENSIONS:
 
            log.debug('    >> %s [WITH CONTENT]' % path)
 
            fobj = open(path, 'rb')
 
            content = fobj.read()
 
            fobj.close()
 
            u_content = safe_unicode(content)
 
            u_content = node.content
 
        else:
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 
        
 
        
 
        
 
        try:
 
            os.stat(path)
 
            writer.add_document(owner=unicode(repo.contact),
 
                            repository=safe_unicode(repo.name),
 
                            path=safe_unicode(path),
 
                            content=u_content,
 
                            modtime=os.path.getmtime(path),
 
                            extension=ext)             
 
        except OSError, e:
 
            import errno
 
            if e.errno == errno.ENOENT:
 
                log.debug('path %s does not exist or is a broken symlink' % path)
 
            else:
 
                raise e                 
 
        writer.add_document(owner=unicode(repo.contact),
 
                        repository=safe_unicode(repo.name),
 
                        path=safe_unicode(path),
 
                        content=u_content,
 
                        modtime=mktime(node.last_changeset.date.timetuple()),
 
                        extension=node.extension)             
 

	
 
    
 
    def build_index(self):
 
        if os.path.exists(IDX_LOCATION):
 
            log.debug('removing previos index')
 
            log.debug('removing previous index')
 
            rmtree(IDX_LOCATION)
 
            
 
        if not os.path.exists(IDX_LOCATION):
 
            os.mkdir(IDX_LOCATION)
 
        
 
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
 
        writer = idx.writer()
 
        
 
        for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
 
            log.debug('building index @ %s' % repo.path)
 
        
 
            for idx_path in self.get_paths(repo.path):
 
                self.add_doc(writer, idx_path, repo)
 
        writer.commit(merge=True)
 
                
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 
            
 
    
 
    def update_index(self):
 
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')
 
            
 
        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
 
        # The set of all paths we need to re-index
 
        to_index = set()
 
        
 
        reader = idx.reader()
 
        writer = idx.writer()
 
    
 
        # Loop over the stored fields in the index
 
        for fields in reader.all_stored_fields():
 
            indexed_path = fields['path']
 
            indexed_paths.add(indexed_path)
 
    
 
            if not os.path.exists(indexed_path):
 
                # This file was deleted since it was indexed
 
                log.debug('removing from index %s' % indexed_path)
 
                writer.delete_by_term('path', indexed_path)
 
    
 
            else:
 
                # Check if this file was changed since it
 
                # was indexed
 
                indexed_time = fields['modtime']
 
                
 
                mtime = os.path.getmtime(indexed_path)
 
    
 
                if mtime > indexed_time:
 
    
 
                    # The file has changed, delete it and add it to the list of
 
                    # files to reindex
 
                    log.debug('adding to reindex list %s' % indexed_path)
 
                    writer.delete_by_term('path', indexed_path)
 
                    to_index.add(indexed_path)
 
                    #writer.commit()
 
    
 
        # Loop over the files in the filesystem
 
        # Assume we have a function that gathers the filenames of the
 
        # documents to be indexed
 
        for repo in scan_paths(self.repo_location).values():
 
            for path in self.get_paths(repo.path):
 
                if path in to_index or path not in indexed_paths:
 
                    # This is either a file that's changed, or a new file
 
                    # that wasn't indexed before. So index it!
 
                    self.add_doc(writer, path, repo)
 
                    log.debug('reindexing %s' % path)
 
    
 
        writer.commit(merge=True)
 
        #idx.optimize()
 
        log.debug('>>> FINISHED <<<')
 
        
 
    def run(self, full_index=False):
 
        """Run daemon"""
 
        if full_index or self.initial:
 
            self.build_index()
 
        else:
 
            self.update_index()
 
        
 
if __name__ == "__main__":
 
    arg = sys.argv[1:]
 
    if len(arg) != 2:
 
        sys.stderr.write('Please specify indexing type [full|incremental]' 
 
                         'and path to repositories as script args \n')
 
        sys.exit()
 
    
 
    
 
    if arg[0] == 'full':
 
        full_index = True
 
    elif arg[0] == 'incremental':
 
        # False means looking just for changes
 
        full_index = False
 
    else:
 
        sys.stdout.write('Please use [full|incremental]' 
 
                         ' as script first arg \n')
 
        sys.exit()
 
    
rhodecode/lib/indexers/multiprocessing_indexer.py
Show inline comments
 
deleted file
0 comments (0 inline, 0 general)