Changeset 95a502d94860 | branch: default | [Not reviewed]
Marcin Kuzminski <marcin@python-works.com>, 2010-10-18 01:14:40
removed soon-to-be-deprecated walk method on repository instance
2 files changed with 4 insertions and 2 deletions
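The change migrates both call sites from the deprecated walk method on the
repository instance to walk on a changeset object. A minimal sketch of the
old and new call styles, using only names that appear in the diff and
assuming a vcs MercurialRepository at a hypothetical path:

    from vcs.backends.hg import MercurialRepository

    repo = MercurialRepository('/srv/repos/example')  # hypothetical path

    # Old, soon-to-be-deprecated style: walk on the repository instance,
    # passing the revision explicitly:
    #
    #     for topnode, dirs, files in repo.walk('/', 'tip'):
    #         ...

    # New style: fetch a changeset first, then walk it.
    tip = repo.get_changeset()  # no argument: defaults to the tip changeset
    for topnode, dirs, files in tip.walk('/'):
        for f in files:
            print(f.path)

Both files below apply exactly this substitution: each gains a
tip = repo.get_changeset() line and swaps the loop header.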
rhodecode/lib/celerylib/tasks.py
from celery.decorators import task

from operator import itemgetter
from pylons.i18n.translation import _
from rhodecode.lib.celerylib import run_task, locked_task
from rhodecode.lib.helpers import person
from rhodecode.lib.smtp_mailer import SmtpMailer
from rhodecode.lib.utils import OrderedDict
from time import mktime
from vcs.backends.hg import MercurialRepository
import traceback

try:
    import json
except ImportError:
    #python 2.5 compatibility
    import simplejson as json

try:
    from celeryconfig import PYLONS_CONFIG as config
    celery_on = True
except ImportError:
    #if celeryconfig is not present let's just load our pylons
    #config instead
    from pylons import config
    celery_on = False


__all__ = ['whoosh_index', 'get_commits_stats',
           'reset_user_password', 'send_email']

def get_session():
    if celery_on:
        from sqlalchemy import engine_from_config
        from sqlalchemy.orm import sessionmaker, scoped_session
        engine = engine_from_config(dict(config.items('app:main')), 'sqlalchemy.db1.')
        sa = scoped_session(sessionmaker(bind=engine))
    else:
        #If we don't use celery reuse our current application Session
        from rhodecode.model.meta import Session
        sa = Session

    return sa

def get_hg_settings():
    from rhodecode.model.db import RhodeCodeSettings
    sa = get_session()
    ret = sa.query(RhodeCodeSettings).all()

    if not ret:
        raise Exception('Could not get application settings !')
    settings = {}
    for each in ret:
        settings['rhodecode_' + each.app_settings_name] = each.app_settings_value

    return settings

def get_hg_ui_settings():
    from rhodecode.model.db import RhodeCodeUi
    sa = get_session()
    ret = sa.query(RhodeCodeUi).all()

    if not ret:
        raise Exception('Could not get application ui settings !')
    settings = {}
    for each in ret:
        k = each.ui_key
        v = each.ui_value
        if k == '/':
            k = 'root_path'

        if k.find('.') != -1:
            k = k.replace('.', '_')

        if each.ui_section == 'hooks':
            v = each.ui_active

        settings[each.ui_section + '_' + k] = v

    return settings

@task
@locked_task
def whoosh_index(repo_location, full_index):
    log = whoosh_index.get_logger()
    from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
    WhooshIndexingDaemon(repo_location=repo_location).run(full_index=full_index)

@task
@locked_task
def get_commits_stats(repo_name, ts_min_y, ts_max_y):
    from rhodecode.model.db import Statistics, Repository
    log = get_commits_stats.get_logger()
    author_key_cleaner = lambda k: person(k).replace('"', "") #for js data compatibility

    commits_by_day_author_aggregate = {}
    commits_by_day_aggregate = {}
    repos_path = get_hg_ui_settings()['paths_root_path'].replace('*', '')
    repo = MercurialRepository(repos_path + repo_name)

    skip_date_limit = True
    parse_limit = 350 #limit for single task changeset parsing optimal for
    last_rev = 0
    last_cs = None
    timegetter = itemgetter('time')

    sa = get_session()

    dbrepo = sa.query(Repository)\
        .filter(Repository.repo_name == repo_name).scalar()
    cur_stats = sa.query(Statistics)\
        .filter(Statistics.repository == dbrepo).scalar()
    if cur_stats:
        last_rev = cur_stats.stat_on_revision
    if not repo.revisions:
        return True

    if last_rev == repo.revisions[-1] and len(repo.revisions) > 1:
        #pass silently without any work if we're not on first revision or current
        #state of parsing revision(from db marker) is the last revision
        return True

    if cur_stats:
        commits_by_day_aggregate = OrderedDict(
                                       json.loads(
                                        cur_stats.commit_activity_combined))
        commits_by_day_author_aggregate = json.loads(cur_stats.commit_activity)

    log.debug('starting parsing %s', parse_limit)
    for cnt, rev in enumerate(repo.revisions[last_rev:]):
        last_cs = cs = repo.get_changeset(rev)
        k = '%s-%s-%s' % (cs.date.timetuple()[0], cs.date.timetuple()[1],
                          cs.date.timetuple()[2])
        timetupple = [int(x) for x in k.split('-')]
        timetupple.extend([0 for _ in xrange(6)])
        k = mktime(timetupple)
        if commits_by_day_author_aggregate.has_key(author_key_cleaner(cs.author)):
            try:
                l = [timegetter(x) for x in commits_by_day_author_aggregate\
                        [author_key_cleaner(cs.author)]['data']]
                time_pos = l.index(k)
            except ValueError:
                time_pos = False

            if time_pos >= 0 and time_pos is not False:

                datadict = commits_by_day_author_aggregate\
                    [author_key_cleaner(cs.author)]['data'][time_pos]

                datadict["commits"] += 1
                datadict["added"] += len(cs.added)
                datadict["changed"] += len(cs.changed)
                datadict["removed"] += len(cs.removed)

            else:
                if k >= ts_min_y and k <= ts_max_y or skip_date_limit:

                    datadict = {"time":k,
                                "commits":1,
                                "added":len(cs.added),
                                "changed":len(cs.changed),
                                "removed":len(cs.removed),
                               }
                    commits_by_day_author_aggregate\
                        [author_key_cleaner(cs.author)]['data'].append(datadict)

        else:
            if k >= ts_min_y and k <= ts_max_y or skip_date_limit:
                commits_by_day_author_aggregate[author_key_cleaner(cs.author)] = {
                                    "label":author_key_cleaner(cs.author),
                                    "data":[{"time":k,
                                             "commits":1,
                                             "added":len(cs.added),
                                             "changed":len(cs.changed),
                                             "removed":len(cs.removed),
                                             }],
                                    "schema":["commits"],
                                    }

        #gather all data by day
        if commits_by_day_aggregate.has_key(k):
            commits_by_day_aggregate[k] += 1
        else:
            commits_by_day_aggregate[k] = 1

        if cnt >= parse_limit:
            #don't fetch too much data since we can freeze application
            break

    overview_data = []
    for k, v in commits_by_day_aggregate.items():
        overview_data.append([k, v])
    overview_data = sorted(overview_data, key=itemgetter(0))

    if not commits_by_day_author_aggregate:
        commits_by_day_author_aggregate[author_key_cleaner(repo.contact)] = {
            "label":author_key_cleaner(repo.contact),
            "data":[0, 1],
            "schema":["commits"],
        }

    stats = cur_stats if cur_stats else Statistics()
    stats.commit_activity = json.dumps(commits_by_day_author_aggregate)
    stats.commit_activity_combined = json.dumps(overview_data)

    log.debug('last revision %s', last_rev)
    leftovers = len(repo.revisions[last_rev:])
    log.debug('revisions to parse %s', leftovers)

    if last_rev == 0 or leftovers < parse_limit:
        stats.languages = json.dumps(__get_codes_stats(repo_name))

    stats.repository = dbrepo
    stats.stat_on_revision = last_cs.revision

    try:
        sa.add(stats)
        sa.commit()
    except:
        log.error(traceback.format_exc())
        sa.rollback()
        return False
    if len(repo.revisions) > 1:
        run_task(get_commits_stats, repo_name, ts_min_y, ts_max_y)

    return True

@task
def reset_user_password(user_email):
    log = reset_user_password.get_logger()
    from rhodecode.lib import auth
    from rhodecode.model.db import User

    try:
        try:
            sa = get_session()
            user = sa.query(User).filter(User.email == user_email).scalar()
            new_passwd = auth.PasswordGenerator().gen_password(8,
                             auth.PasswordGenerator.ALPHABETS_BIG_SMALL)
            if user:
                user.password = auth.get_crypt_password(new_passwd)
                sa.add(user)
                sa.commit()
                log.info('change password for %s', user_email)
            if new_passwd is None:
                raise Exception('unable to generate new password')

        except:
            log.error(traceback.format_exc())
            sa.rollback()

        run_task(send_email, user_email,
                 "Your new rhodecode password",
                 'Your new rhodecode password:%s' % (new_passwd))
        log.info('send new password mail to %s', user_email)

    except:
        log.error('Failed to update user password')
        log.error(traceback.format_exc())
    return True

@task
def send_email(recipients, subject, body):
    log = send_email.get_logger()
    email_config = dict(config.items('DEFAULT'))
    mail_from = email_config.get('app_email_from')
    user = email_config.get('smtp_username')
    passwd = email_config.get('smtp_password')
    mail_server = email_config.get('smtp_server')
    mail_port = email_config.get('smtp_port')
    tls = email_config.get('smtp_use_tls')
    ssl = False

    try:
        m = SmtpMailer(mail_from, user, passwd, mail_server,
                       mail_port, ssl, tls)
        m.send(recipients, subject, body)
    except:
        log.error('Mail sending failed')
        log.error(traceback.format_exc())
        return False
    return True

@task
def create_repo_fork(form_data, cur_user):
    import os
    from rhodecode.model.repo_model import RepoModel
    sa = get_session()
    rm = RepoModel(sa)

    rm.create(form_data, cur_user, just_db=True, fork=True)

    repos_path = get_hg_ui_settings()['paths_root_path'].replace('*', '')
    repo_path = os.path.join(repos_path, form_data['repo_name'])
    repo_fork_path = os.path.join(repos_path, form_data['fork_name'])

    MercurialRepository(str(repo_fork_path), True, clone_url=str(repo_path))

def __get_codes_stats(repo_name):
    LANGUAGES_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'diff', 'do', 'el', 'erl',
                    'h', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']
    repos_path = get_hg_ui_settings()['paths_root_path'].replace('*', '')
    repo = MercurialRepository(repos_path + repo_name)
+    tip = repo.get_changeset()

    code_stats = {}
-    for topnode, dirs, files in repo.walk('/', 'tip'):
+    for topnode, dirs, files in tip.walk('/'):
        for f in files:
            k = f.mimetype
            if f.extension in LANGUAGES_EXTENSIONS:
                if code_stats.has_key(k):
                    code_stats[k] += 1
                else:
                    code_stats[k] = 1

    return code_stats or {}
rhodecode/lib/indexers/daemon.py

#!/usr/bin/env python
# encoding: utf-8
# whoosh indexer daemon for rhodecode
# Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License or (at your opinion) any later version of the license.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA  02110-1301, USA.
"""
Created on Jan 26, 2010

@author: marcink
A daemon will read from task table and run tasks
"""
import sys
import os
from os.path import dirname as dn
from os.path import join as jn

#to get the rhodecode import
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)

from rhodecode.lib.pidlock import LockHeld, DaemonLock
from rhodecode.model.hg_model import HgModel
from rhodecode.lib.helpers import safe_unicode
from whoosh.index import create_in, open_dir
from shutil import rmtree
from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME

from time import mktime
from vcs.exceptions import ChangesetError, RepositoryError

import logging

log = logging.getLogger('whooshIndexer')
# create logger
log.setLevel(logging.DEBUG)
log.propagate = False
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
log.addHandler(ch)

def scan_paths(root_location):
    return HgModel.repo_scan('/', root_location, None, True)

class WhooshIndexingDaemon(object):
    """
    Daemon for atomic jobs
    """

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        self.indexname = indexname
        self.repo_location = repo_location
        self.repo_paths = scan_paths(self.repo_location)
        self.initial = False
        if not os.path.isdir(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)
            log.info('Cannot run incremental index since it does not'
                     ' yet exist running full build')
            self.initial = True

    def get_paths(self, repo):
        """
        recursive walk in root dir and return a set of all paths in that dir
        based on repository walk function
        """
        index_paths_ = set()
+        tip = repo.get_changeset()
        try:
-            for topnode, dirs, files in repo.walk('/', 'tip'):
+            for topnode, dirs, files in tip.walk('/'):
                for f in files:
                    index_paths_.add(jn(repo.path, f.path))
                for dir in dirs:
                    for f in files:
                        index_paths_.add(jn(repo.path, f.path))

        except RepositoryError:
            pass
        return index_paths_

    def get_node(self, repo, path):
        n_path = path[len(repo.path) + 1:]
        node = repo.get_changeset().get_node(n_path)
        return node

    def get_node_mtime(self, node):
        return mktime(node.last_changeset.date.timetuple())

    def add_doc(self, writer, path, repo):
        """Adding doc to writer"""
        node = self.get_node(repo, path)

        #we just index the content of chosen files
        if node.extension in INDEX_EXTENSIONS:
            log.debug('    >> %s [WITH CONTENT]' % path)
            u_content = node.content
        else:
            log.debug('    >> %s' % path)
            #just index file name without its content
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                        repository=safe_unicode(repo.name),
                        path=safe_unicode(path),
                        content=u_content,
                        modtime=self.get_node_mtime(node),
                        extension=node.extension)

    def build_index(self):
        if os.path.exists(IDX_LOCATION):
            log.debug('removing previous index')
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for cnt, repo in enumerate(self.repo_paths.values()):
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo):
                self.add_doc(writer, idx_path, repo)

        log.debug('>> COMMITTING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            repo = self.repo_paths[fields['repository']]

            try:
                node = self.get_node(repo, indexed_path)
            except ChangesetError:
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)

            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['modtime']
                mtime = self.get_node_mtime(node)
                if mtime > indexed_time:
                    # The file has changed, delete it and add it to the list of
                    # files to reindex
                    log.debug('adding to reindex list %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        for repo in self.repo_paths.values():
            for path in self.get_paths(repo):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('re indexing %s' % path)

        log.debug('>> COMMITTING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED REBUILDING INDEX <<<')

    def run(self, full_index=False):
        """Run daemon"""
        if full_index or self.initial:
            self.build_index()
        else:
            self.update_index()

if __name__ == "__main__":
    arg = sys.argv[1:]
    if len(arg) != 2:
        sys.stderr.write('Please specify indexing type [full|incremental] '
                         'and path to repositories as script args \n')
        sys.exit()

    if arg[0] == 'full':
        full_index = True
    elif arg[0] == 'incremental':
        # False means looking just for changes
        full_index = False
    else:
        sys.stdout.write('Please use [full|incremental]'
                         ' as script first arg \n')
        sys.exit()

    if not os.path.isdir(arg[1]):
        sys.stderr.write('%s is not a valid path \n' % arg[1])
        sys.exit()
    else:
        if arg[1].endswith('/'):
            repo_location = arg[1] + '*'
        else:
            repo_location = arg[1] + '/*'

    try:
        l = DaemonLock()
        WhooshIndexingDaemon(repo_location=repo_location)\
            .run(full_index=full_index)
        l.release()
        reload(logging)
    except LockHeld:
        sys.exit(1)