Changeset - 186bf5fee0a1
[Not reviewed]
default
0 3 0
Mads Kiilerich - 9 years ago 2016-07-28 16:28:34
madski@unity3d.com
repo-scan: rewrite get_filesystem_repos to use os.walk instead of stupid recursion

I think this is more readable. It is also faster. Perhaps because the more
readable implementation makes it easier to optimize.
3 files changed with 39 insertions and 28 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/paster_commands/repo_scan.py
Show inline comments
 
@@ -13,57 +13,59 @@
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
"""
 
kallithea.lib.paster_commands.repo_scan
 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

	
 
repo-scan paster command for Kallithea
 

	
 
This file was forked by the Kallithea project in July 2014.
 
Original author and date, and relevant copyright and licensing information is below:
 
:created_on: Feb 9, 2013
 
:author: marcink
 
:copyright: (c) 2013 RhodeCode GmbH, and others.
 
:license: GPLv3, see LICENSE.md for more details.
 
"""
 

	
 

	
 
import os
 
import sys
 

	
 
from kallithea.model.scm import ScmModel
 
from kallithea.lib.utils import BasePasterCommand, repo2db_mapper
 

	
 
# Add location of top level folder to sys.path
 
from os.path import dirname
 
rc_path = dirname(dirname(dirname(os.path.realpath(__file__))))
 
sys.path.append(rc_path)
 

	
 

	
 
class Command(BasePasterCommand):
    """
    repo-scan paster command: rescan the repository root location and hand
    the result to repo2db_mapper.

    Exactly one positional argument is accepted (min_args == max_args == 1),
    shown in the usage text as CONFIG_FILE.
    """

    max_args = 1
    min_args = 1

    usage = "CONFIG_FILE"
    group_name = "Kallithea"
    takes_config_file = -1
    parser = BasePasterCommand.standard_parser(verbose=True)
    summary = "Rescan default location for new repositories"

    def command(self):
        """
        Run the scan and print which repositories were added and removed.

        Obsolete database entries are only removed when the
        --delete-obsolete option was given.
        """
        # get SqlAlchemy session
        self._init_session()
        rm_obsolete = self.options.delete_obsolete
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and keeps this block parseable on Python 3.
        print('Now scanning root location for new repos ...')
        added, removed = repo2db_mapper(ScmModel().repo_scan(),
                                        remove_obsolete=rm_obsolete)
        # '-' is shown when nothing was added / removed
        added = ', '.join(added) or '-'
        removed = ', '.join(removed) or '-'
        # Report the outcome once, one item per line.
        print('Scan completed.')
        print('Added: %s' % added)
        print('Removed: %s' % removed)

    def update_parser(self):
        """Register the --delete-obsolete command line flag."""
        self.parser.add_option(
            '--delete-obsolete',
            action='store_true',
            help="Use this flag to delete repositories that are "
                 "present in Kallithea database but not on the filesystem",
        )
kallithea/lib/utils.py
Show inline comments
 
@@ -159,140 +159,149 @@ def action_logger(user, action, repo, ip
 
        that action was made on
 
    :param ipaddr: optional IP address from what the action was made
 
    :param sa: optional sqlalchemy session
 

	
 
    """
 

	
 
    if not sa:
 
        sa = meta.Session()
 
    # if we don't get explicit IP address try to get one from registered user
 
    # in tmpl context var
 
    if not ipaddr:
 
        ipaddr = getattr(get_current_authuser(), 'ip_addr', '')
 

	
 
    if getattr(user, 'user_id', None):
 
        user_obj = User.get(user.user_id)
 
    elif isinstance(user, basestring):
 
        user_obj = User.get_by_username(user)
 
    else:
 
        raise Exception('You have to provide a user object or a username')
 

	
 
    if getattr(repo, 'repo_id', None):
 
        repo_obj = Repository.get(repo.repo_id)
 
        repo_name = repo_obj.repo_name
 
    elif isinstance(repo, basestring):
 
        repo_name = repo.lstrip('/')
 
        repo_obj = Repository.get_by_repo_name(repo_name)
 
    else:
 
        repo_obj = None
 
        repo_name = u''
 

	
 
    user_log = UserLog()
 
    user_log.user_id = user_obj.user_id
 
    user_log.username = user_obj.username
 
    user_log.action = safe_unicode(action)
 

	
 
    user_log.repository = repo_obj
 
    user_log.repository_name = repo_name
 

	
 
    user_log.action_date = datetime.datetime.now()
 
    user_log.user_ip = ipaddr
 
    sa.add(user_log)
 

	
 
    log.info('Logging action:%s on %s by user:%s ip:%s',
 
             action, safe_unicode(repo), user_obj, ipaddr)
 
    if commit:
 
        sa.commit()
 

	
 

	
 
def get_filesystem_repos(path):
    """
    Scans given path for repos and yields (name, (type, path)) tuples

    The tree below ``path`` is walked top-down with os.walk. Directories
    whose name matches REMOVED_REPO_PAT or starts with '.' are skipped
    entirely. A directory that looks like a repository and is accepted by
    get_scm is yielded and not descended into; every other directory is
    searched recursively.

    :param path: path to scan for repositories
    """

    # remove ending slash for better results
    path = safe_str(path.rstrip(os.sep))
    log.debug('now scanning in %s', path)

    def isdir(*n):
        return os.path.isdir(os.path.join(*n))

    def isfile(*n):
        # needed by the bare git check below ('packed-refs' is a file)
        return os.path.isfile(os.path.join(*n))

    for root, dirs, _files in os.walk(path):
        recurse_dirs = []
        for subdir in dirs:
            # skip removed repos
            if REMOVED_REPO_PAT.match(subdir):
                continue

            #skip .<something> dirs TODO: rly? then we should prevent creating them ...
            if subdir.startswith('.'):
                continue

            cur_path = os.path.join(root, subdir)
            # cheap pre-check: working copy markers, or the layout of a bare
            # git repository ('objects' plus 'refs' dir or 'packed-refs' file)
            if (isdir(cur_path, '.hg') or
                isdir(cur_path, '.git') or
                isdir(cur_path, '.svn') or
                (isdir(cur_path, 'objects') and
                 (isdir(cur_path, 'refs') or isfile(cur_path, 'packed-refs')))):

                if not os.access(cur_path, os.R_OK) or not os.access(cur_path, os.X_OK):
                    log.warning('ignoring repo path without access: %s', cur_path)
                    continue

                if not os.access(cur_path, os.W_OK):
                    log.warning('repo path without write access: %s', cur_path)

                try:
                    scm_info = get_scm(cur_path)
                    assert cur_path.startswith(path)
                    # name relative to the scanned root, without leading separator
                    repo_path = cur_path[len(path) + 1:]
                    yield repo_path, scm_info
                    continue # no recursion
                except VCSError:
                    # We should perhaps ignore such broken repos, but especially
                    # the bare git detection is unreliable so we dive into it
                    pass

            recurse_dirs.append(subdir)

        # os.walk (top-down) only descends into what is left in dirs
        dirs[:] = recurse_dirs
 

	
 

	
 
def is_valid_repo(repo_name, base_path, scm=None):
    """
    Tell whether ``repo_name`` below ``base_path`` is a valid repository.

    When ``scm`` is given, the repository must additionally be of that
    scm type.

    :param repo_name: repository name, joined onto base_path
    :param base_path: directory the repository name is relative to
    :param scm: optional scm type the repository is expected to have

    :return: True if the path is a valid repository (of the expected
        type, when one was given), False otherwise
    """
    repo_dir = os.path.join(safe_str(base_path), safe_str(repo_name))

    try:
        detected = get_scm(repo_dir)
    except VCSError:
        # get_scm did not accept the path as a repository
        return False

    if not scm:
        return True
    # first element of the get_scm result is compared with the requested type
    return detected[0] == scm
 

	
 

	
 
def is_valid_repo_group(repo_group_name, base_path, skip_path_check=False):
 
    """
 
    Returns True if given path is a repository group False otherwise
 

	
 
    :param repo_name:
 
    :param base_path:
 
    """
 
    full_path = os.path.join(safe_str(base_path), safe_str(repo_group_name))
 

	
 
    # check if it's not a repo
 
    if is_valid_repo(repo_group_name, base_path):
 
        return False
 

	
 
    try:
 
        # we need to check bare git repos at higher level
 
        # since we might match branches/hooks/info/objects or possible
 
        # other things inside bare git repo
 
        get_scm(os.path.dirname(full_path))
 
        return False
 
    except VCSError:
 
        pass
 

	
 
    # check if it's a valid path
kallithea/model/scm.py
Show inline comments
 
@@ -146,97 +146,97 @@ class UserGroupList(_PermCheckIterator):
 
                    obj_attr='users_group_name', perm_set=perm_set,
 
                    perm_checker=HasUserGroupPermissionAny,
 
                    extra_kwargs=extra_kwargs)
 

	
 

	
 
class ScmModel(BaseModel):
 
    """
 
    Generic Scm Model
 
    """
 

	
 
    def __get_repo(self, instance):
        """
        Resolve ``instance`` to a Repository.

        :param instance: a Repository (returned unchanged), an int or a
            string of digits (looked up with Repository.get), or any other
            string (looked up with Repository.get_by_repo_name)
        :return: the lookup result; falls through and returns None when no
            branch matches (presumably the ``instance is None`` case)
        :raises Exception: if ``instance`` is not None and of an
            unsupported type
        """
        cls = Repository
        if isinstance(instance, cls):
            return instance
        elif isinstance(instance, int) or safe_str(instance).isdigit():
            return cls.get(instance)
        elif isinstance(instance, basestring):
            return cls.get_by_repo_name(instance)
        elif instance is not None:
            # name the expected class itself; type(cls) would be its metaclass
            raise Exception('given object must be int, basestr or Instance'
                            ' of %s got %s' % (cls, type(instance)))
 

	
 
    @LazyProperty
    def repos_path(self):
        """
        Return the repositories root path from the database: the ui_value
        of the one Ui row whose ui_key is '/'
        """
        root_query = self.sa.query(Ui).filter(Ui.ui_key == '/')
        return root_query.one().ui_value
 

	
 
    def repo_scan(self, repos_path=None):
        """
        Listing of repositories in given path. This path should not be a
        repository itself. Return a dictionary of repository objects, keyed
        by normalized repository name

        Only 'hg' and 'git' entries whose type is present in BACKENDS are
        added; an entry whose backend raises OSError is skipped.

        :param repos_path: path to directory containing repositories,
            defaults to self.repos_path
        :raises RepositoryError: if two scanned entries normalize to the
            same repository name
        """

        if repos_path is None:
            repos_path = self.repos_path

        log.info('scanning for repositories in %s', repos_path)

        baseui = make_ui('db')
        repos = {}

        # path is the (type, path) pair yielded by get_filesystem_repos
        for name, path in get_filesystem_repos(repos_path):
            # name need to be decomposed and put back together using the /
            # since this is internal storage separator for kallithea
            name = Repository.normalize_repo_name(name)

            try:
                if name in repos:
                    raise RepositoryError('Duplicate repository name %s '
                                          'found in %s' % (name, path))

                klass = get_backend(path[0])

                if path[0] == 'hg' and path[0] in BACKENDS:
                    repos[name] = klass(safe_str(path[1]), baseui=baseui)

                if path[0] == 'git' and path[0] in BACKENDS:
                    repos[name] = klass(path[1])
            except OSError:
                continue
        log.debug('found %s paths with repositories', len(repos))
        return repos
 

	
 
    def get_repos(self, repos):
        """Return the given repos wrapped in a RepoList (the repos the user has access to)"""
        accessible = RepoList(repos)
        return accessible
 

	
 
    def get_repo_groups(self, groups=None):
        """Return the given repo groups wrapped in a RepoGroupList (the
        repo groups the user has access to)

        :param groups: repo groups to wrap; when None, the groups without
            a parent (top level groups) are queried and used instead
        """
        if groups is not None:
            return RepoGroupList(groups)

        # "== None" is intentional: it is part of the query filter expression
        without_parent = RepoGroup.group_parent_id == None
        top_level = RepoGroup.query().filter(without_parent).all()
        return RepoGroupList(top_level)
 

	
 
    def mark_for_invalidation(self, repo_name):
        """
        Mark the caches of a repository as invalid in the database and, if
        a repository with that name exists, update its changeset cache.

        :param repo_name: the repo for which caches should be marked invalid
        """
        CacheInvalidation.set_invalidate(repo_name)

        db_repo = Repository.get_by_repo_name(repo_name)
        if db_repo is None:
            # nothing to refresh for an unknown repository name
            return
        db_repo.update_changeset_cache()
 

	
 
    def toggle_following_repo(self, follow_repo_id, user_id):
 

	
0 comments (0 inline, 0 general)