Changeset - 5f21a9dcb09d
[Not reviewed]
beta
0 6 0
Indra Talip - 13 years ago 2012-07-20 12:50:56
indra.talip@gmail.com
create an index for commit messages and the ability to search them and see results
6 files changed with 346 insertions and 116 deletions:
0 comments (0 inline, 0 general)
rhodecode/controllers/search.py
Show inline comments
 
# -*- coding: utf-8 -*-
 
"""
 
    rhodecode.controllers.search
 
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

	
 
    Search controller for rhodecode
 

	
 
    :created_on: Aug 7, 2010
 
    :author: marcink
 
    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
 
    :license: GPLv3, see COPYING for more details.
 
"""
 
# This program is free software: you can redistribute it and/or modify
 
# it under the terms of the GNU General Public License as published by
 
# the Free Software Foundation, either version 3 of the License, or
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import logging
 
import traceback
 

	
 
from pylons.i18n.translation import _
 
from pylons import request, config, tmpl_context as c
 

	
 
from rhodecode.lib.auth import LoginRequired
 
from rhodecode.lib.base import BaseController, render
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, WhooshResultWrapper
 
from rhodecode.lib.indexers import CHGSETS_SCHEMA, SCHEMA, CHGSET_IDX_NAME, IDX_NAME, WhooshResultWrapper
 

	
 
from webhelpers.paginate import Page
 
from webhelpers.util import update_params
 

	
 
from whoosh.index import open_dir, EmptyIndexError
 
from whoosh.qparser import QueryParser, QueryParserError
 
from whoosh.query import Phrase, Wildcard, Term, Prefix
 
from rhodecode.model.repo import RepoModel
 

	
 
log = logging.getLogger(__name__)
 

	
 

	
 
class SearchController(BaseController):
 

	
 
    @LoginRequired()
 
    def __before__(self):
 
        super(SearchController, self).__before__()
 

	
 
    def index(self, search_repo=None):
 
        c.repo_name = search_repo
 
        c.formated_results = []
 
        c.runtime = ''
 
        c.cur_query = request.GET.get('q', None)
 
        c.cur_type = request.GET.get('type', 'source')
 
        c.cur_type = request.GET.get('type', 'content')
 
        c.cur_search = search_type = {'content': 'content',
 
                                      'commit': 'content',
 
                                      'commit': 'message',
 
                                      'path': 'path',
 
                                      'repository': 'repository'}\
 
                                      .get(c.cur_type, 'content')
 

	
 
        index_name = {
 
            'content': IDX_NAME,
 
            'commit': CHGSET_IDX_NAME,
 
            'path': IDX_NAME}\
 
            .get(c.cur_type, IDX_NAME)
 

	
 
        schema_defn = {
 
            'content': SCHEMA,
 
            'commit': CHGSETS_SCHEMA,
 
            'path': SCHEMA}\
 
            .get(c.cur_type, SCHEMA)
 

	
 
        log.debug('IDX: %s' % index_name)
 
        log.debug('SCHEMA: %s' % schema_defn)
 

	
 
        if c.cur_query:
 
            cur_query = c.cur_query.lower()
 
            log.debug(cur_query)
 

	
 
        if c.cur_query:
 
            p = int(request.params.get('page', 1))
 
            highlight_items = set()
 
            try:
 
                idx = open_dir(config['app_conf']['index_dir'],
 
                               indexname=IDX_NAME)
 
                               indexname=index_name)
 
                searcher = idx.searcher()
 

	
 
                qp = QueryParser(search_type, schema=SCHEMA)
 
                qp = QueryParser(search_type, schema=schema_defn)
 
                if c.repo_name:
 
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
 
                try:
 
                    query = qp.parse(unicode(cur_query))
 
                    # extract words for highlight
 
                    if isinstance(query, Phrase):
 
                        highlight_items.update(query.words)
 
                    elif isinstance(query, Prefix):
 
                        highlight_items.add(query.text)
 
                    else:
 
                        for i in query.all_terms():
 
                            if i[0] == 'content':
 
                            if i[0] in ['content', 'message']:
 
                                highlight_items.add(i[1])
 

	
 
                    matcher = query.matcher(searcher)
 

	
 
                    log.debug(query)
 
                    log.debug(highlight_items)
 
                    log.debug('query: %s' % query)
 
                    log.debug('hl terms: %s' % highlight_items)
 
                    results = searcher.search(query)
 
                    res_ln = len(results)
 
                    c.runtime = '%s results (%.3f seconds)' % (
 
                        res_ln, results.runtime
 
                    )
 

	
 
                    def url_generator(**kw):
 
                        return update_params("?q=%s&type=%s" \
 
                                           % (c.cur_query, c.cur_search), **kw)
 
                                           % (c.cur_query, c.cur_type), **kw)
 
                    repo_location = RepoModel().repos_path
 
                    c.formated_results = Page(
 
                        WhooshResultWrapper(search_type, searcher, matcher,
 
                                            highlight_items, repo_location),
 
                        page=p,
 
                        item_count=res_ln,
 
                        items_per_page=10,
 
                        url=url_generator
 
                    )
 

	
 
                except QueryParserError:
 
                    c.runtime = _('Invalid search query. Try quoting it.')
 
                searcher.close()
 
            except (EmptyIndexError, IOError):
 
                log.error(traceback.format_exc())
 
                log.error('Empty Index data')
 
                c.runtime = _('There is no index to search in. '
 
                              'Please run whoosh indexer')
 
            except (Exception):
 
                log.error(traceback.format_exc())
 
                c.runtime = _('An error occurred during this search operation')
 

	
 

	
 
        # Return a rendered template
 
        return render('/search/search.html')
rhodecode/lib/indexers/__init__.py
Show inline comments
 
# -*- coding: utf-8 -*-
 
"""
 
    rhodecode.lib.indexers.__init__
 
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

	
 
    Whoosh indexing module for RhodeCode
 

	
 
    :created_on: Aug 17, 2010
 
    :author: marcink
 
    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
 
    :license: GPLv3, see COPYING for more details.
 
"""
 
# This program is free software: you can redistribute it and/or modify
 
# it under the terms of the GNU General Public License as published by
 
# the Free Software Foundation, either version 3 of the License, or
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import os
 
import sys
 
import traceback
 
import logging
 
from os.path import dirname as dn, join as jn
 

	
 
#to get the rhodecode import
 
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 

	
 
from string import strip
 
from shutil import rmtree
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
 
from whoosh.index import create_in, open_dir
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
 

	
 
from webhelpers.html.builder import escape, literal
 
from sqlalchemy import engine_from_config
 

	
 
from rhodecode.model import init_model
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.model.repo import RepoModel
 
from rhodecode.config.environment import load_environment
 
from rhodecode.lib.utils2 import LazyProperty
 
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
 
    load_rcextensions
 

	
 
log = logging.getLogger(__name__)
 

	
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(
 
    fileid=ID(unique=True),
 
    owner=TEXT(),
 
    repository=TEXT(stored=True),
 
    path=TEXT(stored=True),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = ContextFragmenter(200)
 

	
 
CHGSETS_SCHEMA = Schema(
 
    path=ID(unique=True, stored=True),
 
    revision=NUMERIC(unique=True, stored=True),
 
    last=BOOLEAN(),
 
    owner=TEXT(),
 
    repository=ID(unique=True, stored=True),
 
    author=TEXT(stored=True),
 
    message=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    parents=TEXT(),
 
    added=TEXT(),
 
    removed=TEXT(),
 
    changed=TEXT(),
 
)
 

	
 
CHGSET_IDX_NAME = 'CHGSET_INDEX'
 

	
 
class MakeIndex(BasePasterCommand):
 

	
 
    max_args = 1
 
    min_args = 1
 

	
 
    usage = "CONFIG_FILE"
 
    summary = "Creates index for full text search given configuration file"
 
    group_name = "RhodeCode"
 
    takes_config_file = -1
 
    parser = Command.standard_parser(verbose=True)
 

	
 
    def command(self):
 
        logging.config.fileConfig(self.path_to_ini_file)
 
        from pylons import config
 
        add_cache(config)
 
        engine = engine_from_config(config, 'sqlalchemy.db1.')
 
        init_model(engine)
 
        index_location = config['index_dir']
 
        repo_location = self.options.repo_location \
 
            if self.options.repo_location else RepoModel().repos_path
 
        repo_list = map(strip, self.options.repo_list.split(',')) \
 
            if self.options.repo_list else None
 
        repo_update_list = map(strip, self.options.repo_update_list.split(',')) \
 
            if self.options.repo_update_list else None
 
        load_rcextensions(config['here'])
 
        #======================================================================
 
        # WHOOSH DAEMON
 
        #======================================================================
 
        from rhodecode.lib.pidlock import LockHeld, DaemonLock
 
        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
 
        try:
 
            l = DaemonLock(file_=jn(dn(dn(index_location)), 'make_index.lock'))
 
            WhooshIndexingDaemon(index_location=index_location,
 
                                 repo_location=repo_location,
 
                                 repo_list=repo_list,
 
                                 repo_update_list=repo_update_list)\
 
                .run(full_index=self.options.full_index)
 
            l.release()
 
        except LockHeld:
 
            sys.exit(1)
 

	
 
    def update_parser(self):
 
        self.parser.add_option('--repo-location',
 
                          action='store',
 
                          dest='repo_location',
 
                          help="Specifies repositories location to index OPTIONAL",
 
                          )
 
@@ -146,96 +163,103 @@ class WhooshResultWrapper(object):
 
        self.search_type = search_type
 
        self.searcher = searcher
 
        self.matcher = matcher
 
        self.highlight_items = highlight_items
 
        self.fragment_size = 200
 
        self.repo_location = repo_location
 

	
 
    @LazyProperty
 
    def doc_ids(self):
 
        docs_id = []
 
        while self.matcher.is_active():
 
            docnum = self.matcher.id()
 
            chunks = [offsets for offsets in self.get_chunks()]
 
            docs_id.append([docnum, chunks])
 
            self.matcher.next()
 
        return docs_id
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return len(self.doc_ids)
 

	
 
    def __iter__(self):
 
        """
 
        Allows Iteration over results,and lazy generate content
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for docid in self.doc_ids:
 
            yield self.get_full_content(docid)
 

	
 
    def __getitem__(self, key):
 
        """
 
        Slicing of resultWrapper
 
        """
 
        i, j = key.start, key.stop
 

	
 
        slices = []
 
        for docid in self.doc_ids[i:j]:
 
            slices.append(self.get_full_content(docid))
 
        return slices
 

	
 
    def get_full_content(self, docid):
 
        res = self.searcher.stored_fields(docid[0])
 
        log.debug('result: %s' % res)
 
        full_repo_path = jn(self.repo_location, res['repository'])
 
        f_path = res['path'].split(full_repo_path)[-1]
 
        f_path = f_path.lstrip(os.sep)
 
        res.update({'f_path': f_path})
 

	
 
        content_short = self.get_short_content(res, docid[1])
 
        res.update({'content_short': content_short,
 
                    'content_short_hl': self.highlight(content_short),
 
                    'f_path': f_path})
 
        if self.search_type == 'content':
 
            content_short = self.get_short_content(res, docid[1])
 
            res.update({'content_short': content_short,
 
                        'content_short_hl': self.highlight(content_short)})
 
        elif self.search_type == 'message':
 
            res.update({'message_hl': self.highlight(res['message'])})
 

	
 
        log.debug('result: %s' % res)
 

	
 
        return res
 

	
 
    def get_short_content(self, res, chunks):
 

	
 
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
 

	
 
    def get_chunks(self):
 
        """
 
        Smart function that implements chunking the content
 
        but not overlap chunks so it doesn't highlight the same
 
        close occurrences twice.
 

	
 
        :param matcher:
 
        :param size:
 
        """
 
        memory = [(0, 0)]
 
        for span in self.matcher.spans():
 
            start = span.startchar or 0
 
            end = span.endchar or 0
 
            start_offseted = max(0, start - self.fragment_size)
 
            end_offseted = end + self.fragment_size
 
        if self.matcher.supports('positions'): 
 
            for span in self.matcher.spans():
 
                start = span.startchar or 0
 
                end = span.endchar or 0
 
                start_offseted = max(0, start - self.fragment_size)
 
                end_offseted = end + self.fragment_size
 

	
 
            if start_offseted < memory[-1][1]:
 
                start_offseted = memory[-1][1]
 
            memory.append((start_offseted, end_offseted,))
 
            yield (start_offseted, end_offseted,)
 
                if start_offseted < memory[-1][1]:
 
                    start_offseted = memory[-1][1]
 
                memory.append((start_offseted, end_offseted,))
 
                yield (start_offseted, end_offseted,)
 

	
 
    def highlight(self, content, top=5):
 
        if self.search_type != 'content':
 
        if self.search_type not in ['content', 'message']:
 
            return ''
 
        hl = highlight(
 
            text=content,
 
            terms=self.highlight_items,
 
            analyzer=ANALYZER,
 
            fragmenter=FRAGMENTER,
 
            formatter=FORMATTER,
 
            top=top
 
        )
 
        return hl
rhodecode/lib/indexers/daemon.py
Show inline comments
 
# -*- coding: utf-8 -*-
 
"""
 
    rhodecode.lib.indexers.daemon
 
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

	
 
    A daemon will read from task table and run tasks
 

	
 
    :created_on: Jan 26, 2010
 
    :author: marcink
 
    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
 
    :license: GPLv3, see COPYING for more details.
 
"""
 
# This program is free software: you can redistribute it and/or modify
 
# it under the terms of the GNU General Public License as published by
 
# the Free Software Foundation, either version 3 of the License, or
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 

	
 
import os
 
import sys
 
import logging
 
import traceback
 

	
 
from shutil import rmtree
 
from time import mktime
 

	
 
from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
#to get the rhodecode import
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 
from rhodecode.config.conf import INDEX_EXTENSIONS
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.lib.utils2 import safe_unicode
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME
 

	
 
from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
 
    NodeDoesNotExistError
 

	
 
from whoosh.index import create_in, open_dir
 
from whoosh.index import create_in, open_dir, exists_in
 
from whoosh.query import *
 
from whoosh.qparser import QueryParser
 

	
 
log = logging.getLogger('whoosh_indexer')
 

	
 

	
 
class WhooshIndexingDaemon(object):
 
    """
 
    Daemon for atomic indexing jobs
 
    """
 

	
 
    def __init__(self, indexname=IDX_NAME, index_location=None,
 
                 repo_location=None, sa=None, repo_list=None,
 
                 repo_update_list=None):
 
        self.indexname = indexname
 

	
 
        self.index_location = index_location
 
        if not index_location:
 
            raise Exception('You have to provide index location')
 

	
 
        self.repo_location = repo_location
 
        if not repo_location:
 
            raise Exception('You have to provide repositories location')
 

	
 
        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
 

	
 
        #filter repo list
 
        if repo_list:
 
            self.filtered_repo_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_list:
 
                    self.filtered_repo_paths[repo_name] = repo
 

	
 
            self.repo_paths = self.filtered_repo_paths
 

	
 
        #filter update repo list
 
        self.filtered_repo_update_paths = {}
 
        if repo_update_list:
 
            self.filtered_repo_update_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_update_list:
 
                    self.filtered_repo_update_paths[repo_name] = repo
 
            self.repo_paths = self.filtered_repo_update_paths
 

	
 
        self.initial = False
 
        self.initial = True
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 
        elif not exists_in(self.index_location, IDX_NAME):
 
            log.info('Running full index build as the file content'
 
                     ' index does not exist')
 
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
 
            log.info('Running full index build as the changeset'
 
                     ' index does not exist')
 
        else:
 
            self.initial = False
 

	
 
    def get_paths(self, repo):
 
        """
 
        recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        index_paths_ = set()
 
        try:
 
            tip = repo.get_changeset('tip')
 
            for topnode, dirs, files in tip.walk('/'):
 
                for f in files:
 
                    index_paths_.add(jn(repo.path, f.path))
 

	
 
        except RepositoryError, e:
 
            log.debug(traceback.format_exc())
 
            pass
 
        return index_paths_
 

	
 
    def get_node(self, repo, path):
 
        n_path = path[len(repo.path) + 1:]
 
        node = repo.get_changeset().get_node(n_path)
 
        return node
 

	
 
    def get_node_mtime(self, node):
 
        return mktime(node.last_changeset.date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 

	
 
        node = self.get_node(repo, path)
 
        indexed = indexed_w_content = 0
 
        # we just index the content of chosen files, and skip binary files
 
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]' % path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s' % path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def build_index(self):
 
        if os.path.exists(self.index_location):
 
            log.debug('removing previous index')
 
            rmtree(self.index_location)
 
    def index_changesets(self, writer, repo_name, repo, start_rev=0):
 
        """
 
        Add all changeset in the vcs repo starting at start_rev
 
        to the index writer
 
        """
 

	
 
        log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev))
 

	
 
        if not os.path.exists(self.index_location):
 
            os.mkdir(self.index_location)
 
        indexed=0
 
        for cs in repo[start_rev:]:
 
            writer.add_document(
 
                path=unicode(cs.raw_id),
 
                owner=unicode(repo.contact),
 
                repository=safe_unicode(repo_name),
 
                author=cs.author,
 
                message=cs.message,
 
                revision=cs.revision,
 
                last=cs.last,
 
                added=u' '.join([node.path for node in cs.added]).lower(),
 
                removed=u' '.join([node.path for node in cs.removed]).lower(),
 
                changed=u' '.join([node.path for node in cs.changed]).lower(),
 
                parents=u' '.join([cs.raw_id for cs in cs.parents]),
 
            )
 
            indexed += 1
 

	
 
        idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
 
        writer = idx.writer()
 
        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
 
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 
        log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
 
        i_cnt = iwc_cnt = 0
 
        log.debug('building index for [%s]' % repo.path)
 
        for idx_path in self.get_paths(repo):
 
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
 
            i_cnt += i
 
            iwc_cnt += iwc
 

	
 
        log.debug('added %s files %s with content for repo %s' % (i_cnt + iwc_cnt, iwc_cnt, repo.path))
 

	
 
    def update_changeset_index(self):
 
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
 

	
 
        for repo_name, repo in self.repo_paths.items():
 
            log.debug('building index @ %s' % repo.path)
 
            i_cnt = iwc_cnt = 0
 
            for idx_path in self.get_paths(repo):
 
                i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
 
                i_cnt += i
 
                iwc_cnt += iwc
 
            log.debug('added %s files %s with content for repo %s' % (
 
                         i_cnt + iwc_cnt, iwc_cnt, repo.path)
 
            )
 
        with idx.searcher() as searcher:
 
            writer = idx.writer()
 
            writer_is_dirty = False
 
            try:
 
                for repo_name, repo in self.repo_paths.items():
 
                    # skip indexing if there aren't any revs in the repo
 
                    revs = repo.revisions
 
                    if len(revs) < 1:
 
                        continue
 

	
 
                    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
 
                    q = qp.parse(u"last:t AND %s" % repo_name)
 

	
 
                    results = searcher.search(q, sortedby='revision')
 

	
 
                    last_rev = 0
 
                    if len(results) > 0:
 
                        last_rev = results[0]['revision']
 

	
 
        log.debug('>> COMMITING CHANGES <<')
 
        writer.commit(merge=True)
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 
                    # there are new changesets to index or a new repo to index
 
                    if last_rev == 0 or len(revs) > last_rev + 1:
 
                        # delete the docs in the index for the previous last changeset(s)
 
                        for hit in results:
 
                            q = qp.parse(u"last:t AND %s AND path:%s" % 
 
                                            (repo_name, hit['path']))
 
                            writer.delete_by_query(q)
 

	
 
    def update_index(self):
 
                        # index from the previous last changeset + all new ones
 
                        self.index_changesets(writer, repo_name, repo, last_rev)
 
                        writer_is_dirty = True
 

	
 
            finally:
 
                if writer_is_dirty:
 
                    log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
 
                    writer.commit(merge=True)
 
                    log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
 
                else:
 
                    writer.cancel
 

	
 
    def update_file_index(self):
 
        log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
 
                   'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 

	
 
        idx = open_dir(self.index_location, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
 
        # The set of all paths we need to re-index
 
        to_index = set()
 

	
 
        reader = idx.reader()
 
        writer = idx.writer()
 
        writer_is_dirty = False
 
        try:
 
            with idx.reader() as reader:
 

	
 
                # Loop over the stored fields in the index
 
                for fields in reader.all_stored_fields():
 
                    indexed_path = fields['path']
 
                    indexed_repo_path = fields['repository']
 
                    indexed_paths.add(indexed_path)
 

	
 
                    if not indexed_repo_path in self.filtered_repo_update_paths:
 
                        continue
 

	
 
                    repo = self.repo_paths[indexed_repo_path]
 

	
 
                    try:
 
                        node = self.get_node(repo, indexed_path)
 
                        # Check if this file was changed since it was indexed
 
                        indexed_time = fields['modtime']
 
                        mtime = self.get_node_mtime(node)
 
                        if mtime > indexed_time:
 
                            # The file has changed, delete it and add it to the list of
 
                            # files to reindex
 
                            log.debug('adding to reindex list %s mtime: %s vs %s' % (
 
                                            indexed_path, mtime, indexed_time)
 
                            )
 
                            writer.delete_by_term('fileid', indexed_path)
 
                            writer_is_dirty = True
 

	
 
                            to_index.add(indexed_path)
 
                    except (ChangesetError, NodeDoesNotExistError):
 
                        # This file was deleted since it was indexed
 
                        log.debug('removing from index %s' % indexed_path)
 
                        writer.delete_by_term('path', indexed_path)
 
                        writer_is_dirty = True
 

	
 
        # Loop over the stored fields in the index
 
        for fields in reader.all_stored_fields():
 
            indexed_path = fields['path']
 
            indexed_repo_path = fields['repository']
 
            indexed_paths.add(indexed_path)
 
            # Loop over the files in the filesystem
 
            # Assume we have a function that gathers the filenames of the
 
            # documents to be indexed
 
            ri_cnt_total = 0  # indexed
 
            riwc_cnt_total = 0  # indexed with content
 
            for repo_name, repo in self.repo_paths.items():
 
                # skip indexing if there aren't any revisions
 
                if len(repo) < 1:
 
                    continue
 
                ri_cnt = 0   # indexed
 
                riwc_cnt = 0  # indexed with content
 
                for path in self.get_paths(repo):
 
                    path = safe_unicode(path)
 
                    if path in to_index or path not in indexed_paths:
 

	
 
            if not indexed_repo_path in self.filtered_repo_update_paths:
 
                        # This is either a file that's changed, or a new file
 
                        # that wasn't indexed before. So index it!
 
                        i, iwc = self.add_doc(writer, path, repo, repo_name)
 
                        writer_is_dirty = True
 
                        log.debug('re indexing %s' % path)
 
                        ri_cnt += i
 
                        ri_cnt_total += 1
 
                        riwc_cnt += iwc
 
                        riwc_cnt_total += iwc
 
                log.debug('added %s files %s with content for repo %s' % (
 
                             ri_cnt + riwc_cnt, riwc_cnt, repo.path)
 
                )
 
            log.debug('indexed %s files in total and %s with content' % (
 
                        ri_cnt_total, riwc_cnt_total)
 
            )
 
        finally:
 
            if writer_is_dirty:
 
                log.debug('>> COMMITING CHANGES <<')
 
                writer.commit(merge=True)
 
                log.debug('>>> FINISHED REBUILDING INDEX <<<')
 
            else:
 
                writer.cancel()
 

	
 
    def build_indexes(self):
 
        if os.path.exists(self.index_location):
 
            log.debug('removing previous index')
 
            rmtree(self.index_location)
 

	
 
        if not os.path.exists(self.index_location):
 
            os.mkdir(self.index_location)
 

	
 
        chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
 
        chgset_idx_writer = chgset_idx.writer()
 

	
 
        file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
 
        file_idx_writer = file_idx.writer()
 
        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
 
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 

	
 
        for repo_name, repo in self.repo_paths.items():
 
            # skip indexing if there aren't any revisions
 
            if len(repo) < 1:
 
                continue
 

	
 
            repo = self.repo_paths[indexed_repo_path]
 

	
 
            try:
 
                node = self.get_node(repo, indexed_path)
 
                # Check if this file was changed since it was indexed
 
                indexed_time = fields['modtime']
 
                mtime = self.get_node_mtime(node)
 
                if mtime > indexed_time:
 
                    # The file has changed, delete it and add it to the list of
 
                    # files to reindex
 
                    log.debug('adding to reindex list %s mtime: %s vs %s' % (
 
                                    indexed_path, mtime, indexed_time)
 
                    )
 
                    writer.delete_by_term('fileid', indexed_path)
 

	
 
                    to_index.add(indexed_path)
 
            except (ChangesetError, NodeDoesNotExistError):
 
                # This file was deleted since it was indexed
 
                log.debug('removing from index %s' % indexed_path)
 
                writer.delete_by_term('path', indexed_path)
 
            self.index_files(file_idx_writer, repo_name, repo)
 
            self.index_changesets(chgset_idx_writer, repo_name, repo)
 

	
 
        # Loop over the files in the filesystem
 
        # Assume we have a function that gathers the filenames of the
 
        # documents to be indexed
 
        ri_cnt_total = 0  # indexed
 
        riwc_cnt_total = 0  # indexed with content
 
        for repo_name, repo in self.repo_paths.items():
 
            ri_cnt = 0   # indexed
 
            riwc_cnt = 0  # indexed with content
 
            for path in self.get_paths(repo):
 
                path = safe_unicode(path)
 
                if path in to_index or path not in indexed_paths:
 
        log.debug('>> COMMITING CHANGES <<')
 
        file_idx_writer.commit(merge=True)
 
        chgset_idx_writer.commit(merge=True)
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 

	
 
                    # This is either a file that's changed, or a new file
 
                    # that wasn't indexed before. So index it!
 
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
 
                    log.debug('re indexing %s' % path)
 
                    ri_cnt += i
 
                    ri_cnt_total += 1
 
                    riwc_cnt += iwc
 
                    riwc_cnt_total += iwc
 
            log.debug('added %s files %s with content for repo %s' % (
 
                         ri_cnt + riwc_cnt, riwc_cnt, repo.path)
 
            )
 
        log.debug('indexed %s files in total and %s with content' % (
 
                    ri_cnt_total, riwc_cnt_total)
 
        )
 
        log.debug('>> COMMITING CHANGES <<')
 
        writer.commit(merge=True)
 
        log.debug('>>> FINISHED REBUILDING INDEX <<<')
 
    def update_indexes(self):
 
        self.update_file_index()
 
        self.update_changeset_index()
 

	
 
    def run(self, full_index=False):
 
        """Run daemon"""
 
        if full_index or self.initial:
 
            self.build_index()
 
            self.build_indexes()
 
        else:
 
            self.update_index()
 
            self.update_indexes()
rhodecode/templates/search/search.html
Show inline comments
 
@@ -16,72 +16,72 @@
 
   %endif
 
	- ${c.rhodecode_name}
 
</%def>
 
<%def name="breadcrumbs()">
 
	${c.rhodecode_name}
 
</%def>
 
<%def name="page_nav()">
 
	${self.menu('home')}
 
</%def>
 
<%def name="main()">
 

	
 
<div class="box">
 
	<!-- box / title -->
 
	<div class="title">
 
		<h5>
 
		%if c.repo_name:
 
			${_('Search in repository: %s') % c.repo_name}
 
		%else:
 
			${_('Search in all repositories')}
 
		%endif
 
		</h5>
 
	</div>
 
	<!-- end box / title -->
 
	%if c.repo_name:
 
		${h.form(h.url('search_repo',search_repo=c.repo_name),method='get')}
 
	%else:
 
		${h.form(h.url('search'),method='get')}
 
	%endif
 
	<div class="form">
 
		<div class="fields">
 
			<div class="field field-first field-noborder">
 
             <div class="label">
 
                 <label for="q">${_('Search term')}</label>
 
             </div>
 
				<div class="input">${h.text('q',c.cur_query,class_="small")}
 
					<div class="button highlight">
 
						<input type="submit" value="${_('Search')}" class="ui-button"/>
 
					</div>
 
				</div>
 
				<div style="font-weight: bold;clear:Both;margin-left:200px">${c.runtime}</div>
 
			</div>
 

	
 
			<div class="field">
 
	            <div class="label">
 
	                <label for="type">${_('Search in')}</label>
 
	            </div>
 
                <div class="select">
 
                    ${h.select('type',c.cur_type,[('content',_('File contents')),
 
                        ##('commit',_('Commit messages')),
 
                        ('commit',_('Commit messages')),
 
                        ('path',_('File names')),
 
                        ##('repository',_('Repository names')),
 
                        ])}
 
                </div>
 
             </div>
 

	
 
		</div>
 
	</div>
 
	${h.end_form()}
 
    <div class="search">
 
    %if c.cur_search == 'content':
 
    %if c.cur_type == 'content':
 
        <%include file='search_content.html'/>
 
    %elif c.cur_search == 'path':
 
    %elif c.cur_type == 'path':
 
        <%include file='search_path.html'/>
 
    %elif c.cur_search == 'commit':
 
    %elif c.cur_type == 'commit':
 
        <%include file='search_commit.html'/>
 
    %elif c.cur_search == 'repository':
 
    %elif c.cur_type == 'repository':
 
        <%include file='search_repository.html'/>
 
    %endif
 
    </div>
 
</div>
 

	
 
</%def>
rhodecode/templates/search/search_commit.html
Show inline comments
 
##commit highligthing
 

	
 
%for cnt,sr in enumerate(c.formated_results):
 
    %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
 
    <div class="table">
 
        <div id="body${cnt}" class="codeblock">
 
            <div class="code-header">
 
                <div class="search-path">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('changeset_home',repo_name=sr['repository'],revision=sr['path']))}
 
                </div>
 
            </div>
 
            <div class="left">
 
                <div class="author">
 
                    <div class="gravatar">
 
                        <img alt="gravatar" src="${h.gravatar_url(h.email(sr['author']),20)}"/>
 
                    </div>
 
                    <span>${h.person(sr['author'])}</span><br/>
 
                    <span><a href="mailto:${h.email_or_none(sr['author'])}">${h.email_or_none(sr['author'])}</a></span><br/>
 
                </div>
 
                %if sr['message_hl']:
 
                <div class="search-code-body">
 
                    <pre>${h.literal(sr['message_hl'])}</pre>
 
                </div>
 
                %else:
 
                <div class="message">${h.urlify_commit(sr['message'], sr['repository'])}</div>
 
                %endif
 
            </div>
 
        </div>
 
    </div>
 
    %else:
 
        %if cnt == 0:
 
        <div class="table">
 
            <div id="body${cnt}" class="codeblock">
 
                <div class="error">${_('Permission denied')}</div>
 
            </div>
 
        </div>
 
        %endif
 
    %endif
 
%endfor
 
%if c.cur_query and c.formated_results:
 
<div class="pagination-wh pagination-left">
 
    ${c.formated_results.pager('$link_previous ~2~ $link_next')}
 
</div>
 
%endif
rhodecode/tests/functional/test_search.py
Show inline comments
 
import os
 
from rhodecode.tests import *
 
from nose.plugins.skip import SkipTest
 

	
 

	
 
class TestSearchController(TestController):
 

	
 
    def test_index(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'))
 

	
 
        self.assertTrue('class="small" id="q" name="q" type="text"' in
 
                        response.body)
 
        # Test response...
 

	
 
    def test_empty_search(self):
 
        if os.path.isdir(self.index_location):
 
            raise SkipTest('skipped due to existing index')
 
        else:
 
            self.log_user()
 
            response = self.app.get(url(controller='search', action='index'),
 
                                    {'q': HG_REPO})
 
            self.assertTrue('There is no index to search in. '
 
                            'Please run whoosh indexer' in response.body)
 

	
 
    def test_normal_search(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'def repo'})
 
        response.mustcontain('39 results')
 
        response.mustcontain('10 results')
 

	
 
    def test_repo_search(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'repository:%s def test' % HG_REPO})
 

	
 
        response.mustcontain('4 results')
 

	
 
    def test_search_last(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'last:t', 'type': 'commit'})
 

	
 
        response.mustcontain('1 results')
 

	
 
    def test_search_commit_message(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'bother to ask where to fetch repo during tests',
 
                                 'type': 'commit'})
 

	
 
        response.mustcontain('1 results')
 
        response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
 

	
 
    def test_search_commit_changed_file(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'changed:tests/utils.py',
 
                                 'type': 'commit'})
 

	
 
        response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
 

	
 
    def test_search_commit_added_file(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'added:README.rst',
 
                                 'type': 'commit'})
 

	
 
        response.mustcontain('1 results')
 
        response.mustcontain('3803844fdbd3b711175fc3da9bdacfcd6d29a6fb')
 

	
 
    def test_search_author(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'author:marcin@python-blog.com revision:0',
 
                                 'type': 'commit'})
 

	
 
        response.mustcontain('1 results')
0 comments (0 inline, 0 general)