Changeset - 5f21a9dcb09d
[Not reviewed]
beta
0 6 0
Indra Talip - 13 years ago 2012-07-20 12:50:56
indra.talip@gmail.com
create an index for commit messages and the ability to search them and see results
6 files changed with 276 insertions and 46 deletions:
0 comments (0 inline, 0 general)
rhodecode/controllers/search.py
Show inline comments
 
@@ -30,7 +30,7 @@ from pylons import request, config, tmpl
 

	
 
from rhodecode.lib.auth import LoginRequired
 
from rhodecode.lib.base import BaseController, render
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, WhooshResultWrapper
 
from rhodecode.lib.indexers import CHGSETS_SCHEMA, SCHEMA, CHGSET_IDX_NAME, IDX_NAME, WhooshResultWrapper
 

	
 
from webhelpers.paginate import Page
 
from webhelpers.util import update_params
 
@@ -54,25 +54,41 @@ class SearchController(BaseController):
 
        c.formated_results = []
 
        c.runtime = ''
 
        c.cur_query = request.GET.get('q', None)
 
        c.cur_type = request.GET.get('type', 'source')
 
        c.cur_type = request.GET.get('type', 'content')
 
        c.cur_search = search_type = {'content': 'content',
 
                                      'commit': 'content',
 
                                      'commit': 'message',
 
                                      'path': 'path',
 
                                      'repository': 'repository'}\
 
                                      .get(c.cur_type, 'content')
 

	
 
        index_name = {
 
            'content': IDX_NAME,
 
            'commit': CHGSET_IDX_NAME,
 
            'path': IDX_NAME}\
 
            .get(c.cur_type, IDX_NAME)
 

	
 
        schema_defn = {
 
            'content': SCHEMA,
 
            'commit': CHGSETS_SCHEMA,
 
            'path': SCHEMA}\
 
            .get(c.cur_type, SCHEMA)
 

	
 
        log.debug('IDX: %s' % index_name)
 
        log.debug('SCHEMA: %s' % schema_defn)
 

	
 
        if c.cur_query:
 
            cur_query = c.cur_query.lower()
 
            log.debug(cur_query)
 

	
 
        if c.cur_query:
 
            p = int(request.params.get('page', 1))
 
            highlight_items = set()
 
            try:
 
                idx = open_dir(config['app_conf']['index_dir'],
 
                               indexname=IDX_NAME)
 
                               indexname=index_name)
 
                searcher = idx.searcher()
 

	
 
                qp = QueryParser(search_type, schema=SCHEMA)
 
                qp = QueryParser(search_type, schema=schema_defn)
 
                if c.repo_name:
 
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
 
                try:
 
@@ -84,13 +100,13 @@ class SearchController(BaseController):
 
                        highlight_items.add(query.text)
 
                    else:
 
                        for i in query.all_terms():
 
                            if i[0] == 'content':
 
                            if i[0] in ['content', 'message']:
 
                                highlight_items.add(i[1])
 

	
 
                    matcher = query.matcher(searcher)
 

	
 
                    log.debug(query)
 
                    log.debug(highlight_items)
 
                    log.debug('query: %s' % query)
 
                    log.debug('hl terms: %s' % highlight_items)
 
                    results = searcher.search(query)
 
                    res_ln = len(results)
 
                    c.runtime = '%s results (%.3f seconds)' % (
 
@@ -99,7 +115,7 @@ class SearchController(BaseController):
 

	
 
                    def url_generator(**kw):
 
                        return update_params("?q=%s&type=%s" \
 
                                           % (c.cur_query, c.cur_search), **kw)
 
                                           % (c.cur_query, c.cur_type), **kw)
 
                    repo_location = RepoModel().repos_path
 
                    c.formated_results = Page(
 
                        WhooshResultWrapper(search_type, searcher, matcher,
rhodecode/lib/indexers/__init__.py
Show inline comments
 
@@ -35,7 +35,7 @@ from string import strip
 
from shutil import rmtree
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
 
from whoosh.index import create_in, open_dir
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
 
@@ -51,10 +51,11 @@ from rhodecode.lib.utils2 import LazyPro
 
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
 
    load_rcextensions
 

	
 
log = logging.getLogger(__name__)
 

	
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(
 
    fileid=ID(unique=True),
 
@@ -71,6 +72,22 @@ IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = ContextFragmenter(200)
 

	
 
# Whoosh schema of the changeset (commit message) index that is stored under
# CHGSET_IDX_NAME. Documents are written by
# WhooshIndexingDaemon.index_changesets, one document per changeset.
# NOTE(review): ``revision`` and ``repository`` are declared ``unique=True``
# although many documents share the same repository (and revision numbers
# repeat across repositories) -- confirm this is intended before anything
# starts relying on ``update_document()`` for this index.
CHGSETS_SCHEMA = Schema(
    # raw_id of the changeset
    path=ID(unique=True, stored=True),
    # numeric revision of the changeset inside its repository
    revision=NUMERIC(unique=True, stored=True),
    # ``last`` flag of the changeset; the incremental update queries
    # ``last:t`` to find where the previous indexing run stopped
    last=BOOLEAN(),
    # contact of the repository
    owner=TEXT(),
    # name of the repository the changeset belongs to
    repository=ID(unique=True, stored=True),
    # author of the changeset
    author=TEXT(stored=True),
    # commit message, tokenized with ANALYZER; this is the field searched and
    # highlighted for the 'commit' search type
    message=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    # space separated raw_ids of the parent changesets
    parents=TEXT(),
    # space separated, lowercased paths of the added / removed / changed nodes
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)
 

	
 
CHGSET_IDX_NAME = 'CHGSET_INDEX'
 

	
 
class MakeIndex(BasePasterCommand):
 

	
 
@@ -191,14 +208,20 @@ class WhooshResultWrapper(object):
 

	
 
    def get_full_content(self, docid):
        """
        Build the result dictionary for a single search hit.

        :param docid: pair whose first item is handed to
            ``searcher.stored_fields`` and whose second item is forwarded to
            ``get_short_content``
        :returns: the stored fields of the hit extended with ``f_path`` and,
            depending on ``self.search_type``, with ``content_short`` and
            ``content_short_hl`` (``'content'``) or ``message_hl``
            (``'message'``)
        """
        res = self.searcher.stored_fields(docid[0])
        log.debug('result: %s' % res)

        # strip the absolute repository location from the stored path so that
        # only the part relative to the repository is left
        full_repo_path = jn(self.repo_location, res['repository'])
        f_path = res['path'].split(full_repo_path)[-1]
        f_path = f_path.lstrip(os.sep)
        res.update({'f_path': f_path})

        # only the search types that have something to highlight add extra
        # keys; every other type returns the stored fields plus ``f_path``
        if self.search_type == 'content':
            content_short = self.get_short_content(res, docid[1])
            res.update({'content_short': content_short,
                        'content_short_hl': self.highlight(content_short)})
        elif self.search_type == 'message':
            res.update({'message_hl': self.highlight(res['message'])})

        log.debug('result: %s' % res)

        return res
 

	
 
@@ -216,6 +239,7 @@ class WhooshResultWrapper(object):
 
        :param size:
 
        """
 
        memory = [(0, 0)]
 
        if self.matcher.supports('positions'): 
 
        for span in self.matcher.spans():
 
            start = span.startchar or 0
 
            end = span.endchar or 0
 
@@ -228,7 +252,7 @@ class WhooshResultWrapper(object):
 
            yield (start_offseted, end_offseted,)
 

	
 
    def highlight(self, content, top=5):
 
        if self.search_type != 'content':
 
        if self.search_type not in ['content', 'message']:
 
            return ''
 
        hl = highlight(
 
            text=content,
rhodecode/lib/indexers/daemon.py
Show inline comments
 
@@ -41,12 +41,14 @@ sys.path.append(project_path)
 
from rhodecode.config.conf import INDEX_EXTENSIONS
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.lib.utils2 import safe_unicode
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME
 

	
 
from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
 
    NodeDoesNotExistError
 

	
 
from whoosh.index import create_in, open_dir
 
from whoosh.index import create_in, open_dir, exists_in
 
from whoosh.query import *
 
from whoosh.qparser import QueryParser
 

	
 
log = logging.getLogger('whoosh_indexer')
 

	
 
@@ -89,12 +91,19 @@ class WhooshIndexingDaemon(object):
 
                    self.filtered_repo_update_paths[repo_name] = repo
 
            self.repo_paths = self.filtered_repo_update_paths
 

	
 
        self.initial = False
 
        self.initial = True
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 
        elif not exists_in(self.index_location, IDX_NAME):
 
            log.info('Running full index build as the file content'
 
                     ' index does not exist')
 
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
 
            log.info('Running full index build as the changeset'
 
                     ' index does not exist')
 
        else:
 
            self.initial = False
 

	
 
    def get_paths(self, repo):
 
        """
 
@@ -158,35 +167,86 @@ class WhooshIndexingDaemon(object):
 
        )
 
        return indexed, indexed_w_content
 

	
 
    def build_index(self):
 
        if os.path.exists(self.index_location):
 
            log.debug('removing previous index')
 
            rmtree(self.index_location)
 
    def index_changesets(self, writer, repo_name, repo, start_rev=0):
        """
        Add all changeset in the vcs repo starting at start_rev
        to the index writer

        Nothing is committed here; committing (or cancelling) the writer is
        left to the caller.

        :param writer: index writer the changeset documents are added to
        :param repo_name: name stored in the ``repository`` field
        :param repo: vcs repository; it is sliced as ``repo[start_rev:]`` and
            its ``contact`` is stored in the ``owner`` field
        :param start_rev: first revision to index, defaults to 0
        """
        log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev))

        indexed = 0
        for cs in repo[start_rev:]:
            writer.add_document(
                path=unicode(cs.raw_id),
                owner=unicode(repo.contact),
                repository=safe_unicode(repo_name),
                author=cs.author,
                message=cs.message,
                revision=cs.revision,
                last=cs.last,
                added=u' '.join([node.path for node in cs.added]).lower(),
                removed=u' '.join([node.path for node in cs.removed]).lower(),
                changed=u' '.join([node.path for node in cs.changed]).lower(),
                # use a distinct loop name: under Python 2 a list
                # comprehension leaks its variable and would rebind ``cs``
                parents=u' '.join([parent.raw_id for parent in cs.parents]),
            )
            indexed += 1

        log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
 

	
 
    def index_files(self, file_idx_writer, repo_name, repo):
        """
        Add every path reported by ``get_paths(repo)`` to the file index.

        Nothing is committed here; that is left to the caller.

        :param file_idx_writer: writer of the file content index, handed on
            to ``add_doc``
        :param repo_name: name of the repository, handed on to ``add_doc``
        :param repo: vcs repository whose files are indexed
        """
        # i_cnt / iwc_cnt accumulate the two counters returned by add_doc
        i_cnt = iwc_cnt = 0
        log.debug('building index for [%s]' % repo.path)
        for idx_path in self.get_paths(repo):
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
            i_cnt += i
            iwc_cnt += iwc

        log.debug('added %s files %s with content for repo %s' % (
            i_cnt + iwc_cnt, iwc_cnt, repo.path))
 

	
 
    def update_changeset_index(self):
        """
        Incrementally update the changeset index.

        For every repository in ``self.repo_paths`` that has revisions, the
        index is asked for the document(s) flagged ``last`` for that
        repository. When nothing was indexed yet, or the repository has more
        revisions than the indexed one, the previous ``last`` document(s) are
        deleted and all changesets from that revision on are indexed again.

        The writer is committed when something was changed (also when an
        exception interrupts the loop after changes were made) and cancelled
        otherwise.
        """
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

        with idx.searcher() as searcher:
            writer = idx.writer()
            writer_is_dirty = False
            # the parser does not depend on the repository, build it once;
            # bare terms are looked up in the ``repository`` field
            qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
            try:
                for repo_name, repo in self.repo_paths.items():
                    # skip indexing if there aren't any revs in the repo
                    revs = repo.revisions
                    if len(revs) < 1:
                        continue

                    q = qp.parse(u"last:t AND %s" % repo_name)

                    results = searcher.search(q, sortedby='revision')

                    last_rev = 0
                    if len(results) > 0:
                        last_rev = results[0]['revision']

                    # there are new changesets to index or a new repo to index
                    if last_rev == 0 or len(revs) > last_rev + 1:
                        # delete the docs in the index for the previous
                        # last changeset(s)
                        for hit in results:
                            q = qp.parse(u"last:t AND %s AND path:%s" %
                                            (repo_name, hit['path']))
                            writer.delete_by_query(q)

                        # index from the previous last changeset + all new ones
                        self.index_changesets(writer, repo_name, repo, last_rev)
                        writer_is_dirty = True

            finally:
                if writer_is_dirty:
                    log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                    writer.commit(merge=True)
                    log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
                else:
                    # actually call cancel(); a bare ``writer.cancel`` is a
                    # no-op attribute lookup and never cancels the writer
                    writer.cancel()
 

	
 
    def update_index(self):
 
    def update_file_index(self):
 
        log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
 
                   'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
 

	
 
@@ -196,8 +256,10 @@ class WhooshIndexingDaemon(object):
 
        # The set of all paths we need to re-index
 
        to_index = set()
 

	
 
        reader = idx.reader()
 
        writer = idx.writer()
 
        writer_is_dirty = False
 
        try:
 
            with idx.reader() as reader:
 

	
 
        # Loop over the stored fields in the index
 
        for fields in reader.all_stored_fields():
 
@@ -222,12 +284,14 @@ class WhooshIndexingDaemon(object):
 
                                    indexed_path, mtime, indexed_time)
 
                    )
 
                    writer.delete_by_term('fileid', indexed_path)
 
                            writer_is_dirty = True
 

	
 
                    to_index.add(indexed_path)
 
            except (ChangesetError, NodeDoesNotExistError):
 
                # This file was deleted since it was indexed
 
                log.debug('removing from index %s' % indexed_path)
 
                writer.delete_by_term('path', indexed_path)
 
                        writer_is_dirty = True
 

	
 
        # Loop over the files in the filesystem
 
        # Assume we have a function that gathers the filenames of the
 
@@ -235,6 +299,9 @@ class WhooshIndexingDaemon(object):
 
        ri_cnt_total = 0  # indexed
 
        riwc_cnt_total = 0  # indexed with content
 
        for repo_name, repo in self.repo_paths.items():
 
                # skip indexing if there aren't any revisions
 
                if len(repo) < 1:
 
                    continue
 
            ri_cnt = 0   # indexed
 
            riwc_cnt = 0  # indexed with content
 
            for path in self.get_paths(repo):
 
@@ -244,6 +311,7 @@ class WhooshIndexingDaemon(object):
 
                    # This is either a file that's changed, or a new file
 
                    # that wasn't indexed before. So index it!
 
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
 
                        writer_is_dirty = True
 
                    log.debug('re indexing %s' % path)
 
                    ri_cnt += i
 
                    ri_cnt_total += 1
 
@@ -255,13 +323,50 @@ class WhooshIndexingDaemon(object):
 
        log.debug('indexed %s files in total and %s with content' % (
 
                    ri_cnt_total, riwc_cnt_total)
 
        )
 
        finally:
 
            if writer_is_dirty:
 
        log.debug('>> COMMITING CHANGES <<')
 
        writer.commit(merge=True)
 
        log.debug('>>> FINISHED REBUILDING INDEX <<<')
 
            else:
 
                writer.cancel()
 

	
 
    def build_indexes(self):
        """
        Build the file index and the changeset index from scratch.

        An existing index directory is removed first. Both indexes are then
        created in ``self.index_location``, every repository in
        ``self.repo_paths`` that has at least one revision is indexed, and
        finally both writers are committed.
        """
        location = self.index_location

        # start from an empty directory
        if os.path.exists(location):
            log.debug('removing previous index')
            rmtree(location)

        if not os.path.exists(location):
            os.mkdir(location)

        chgset_idx_writer = create_in(location, CHGSETS_SCHEMA,
                                      indexname=CHGSET_IDX_NAME).writer()
        file_idx_writer = create_in(location, SCHEMA,
                                    indexname=IDX_NAME).writer()

        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        for repo_name, repo in self.repo_paths.items():
            # repositories without any revision have nothing to index
            if len(repo) >= 1:
                self.index_files(file_idx_writer, repo_name, repo)
                self.index_changesets(chgset_idx_writer, repo_name, repo)

        log.debug('>> COMMITING CHANGES <<')
        for idx_writer in (file_idx_writer, chgset_idx_writer):
            idx_writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 

	
 
    def update_indexes(self):
        """
        Run the incremental update of the file index, then the incremental
        update of the changeset index.
        """
        self.update_file_index()
        return self.update_changeset_index() and None
 

	
 
    def run(self, full_index=False):
        """
        Run daemon

        :param full_index: when true both indexes are rebuilt from scratch;
            a full build is also done when ``self.initial`` is set, otherwise
            the existing indexes are updated incrementally
        """
        # only the current entry points are called here; the superseded
        # ``build_index`` / ``update_index`` must not be invoked any more
        if full_index or self.initial:
            self.build_indexes()
        else:
            self.update_indexes()
rhodecode/templates/search/search.html
Show inline comments
 
@@ -61,7 +61,7 @@
 
	            </div>
 
                <div class="select">
 
                    ${h.select('type',c.cur_type,[('content',_('File contents')),
 
                        ##('commit',_('Commit messages')),
 
                        ('commit',_('Commit messages')),
 
                        ('path',_('File names')),
 
                        ##('repository',_('Repository names')),
 
                        ])}
 
@@ -72,13 +72,13 @@
 
	</div>
 
	${h.end_form()}
 
    <div class="search">
 
    %if c.cur_search == 'content':
 
    %if c.cur_type == 'content':
 
        <%include file='search_content.html'/>
 
    %elif c.cur_search == 'path':
 
    %elif c.cur_type == 'path':
 
        <%include file='search_path.html'/>
 
    %elif c.cur_search == 'commit':
 
    %elif c.cur_type == 'commit':
 
        <%include file='search_commit.html'/>
 
    %elif c.cur_search == 'repository':
 
    %elif c.cur_type == 'repository':
 
        <%include file='search_repository.html'/>
 
    %endif
 
    </div>
rhodecode/templates/search/search_commit.html
Show inline comments
 
##commit highlighting
 

	
 
%for cnt,sr in enumerate(c.formated_results):
 
    %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
 
    <div class="table">
 
        <div id="body${cnt}" class="codeblock">
 
            <div class="code-header">
 
                <div class="search-path">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('changeset_home',repo_name=sr['repository'],revision=sr['path']))}
 
                </div>
 
            </div>
 
            <div class="left">
 
                <div class="author">
 
                    <div class="gravatar">
 
                        <img alt="gravatar" src="${h.gravatar_url(h.email(sr['author']),20)}"/>
 
                    </div>
 
                    <span>${h.person(sr['author'])}</span><br/>
 
                    <span><a href="mailto:${h.email_or_none(sr['author'])}">${h.email_or_none(sr['author'])}</a></span><br/>
 
                </div>
 
                %if sr['message_hl']:
 
                <div class="search-code-body">
 
                    <pre>${h.literal(sr['message_hl'])}</pre>
 
                </div>
 
                %else:
 
                <div class="message">${h.urlify_commit(sr['message'], sr['repository'])}</div>
 
                %endif
 
            </div>
 
        </div>
 
    </div>
 
    %else:
 
        %if cnt == 0:
 
        <div class="table">
 
            <div id="body${cnt}" class="codeblock">
 
                <div class="error">${_('Permission denied')}</div>
 
            </div>
 
        </div>
 
        %endif
 
    %endif
 
%endfor
 
%if c.cur_query and c.formated_results:
 
<div class="pagination-wh pagination-left">
 
    ${c.formated_results.pager('$link_previous ~2~ $link_next')}
 
</div>
 
%endif
rhodecode/tests/functional/test_search.py
Show inline comments
 
@@ -27,7 +27,7 @@ class TestSearchController(TestControlle
 
        self.log_user()
 
        response = self.app.get(url(controller='search', action='index'),
 
                                {'q': 'def repo'})
 
        response.mustcontain('39 results')
 
        response.mustcontain('10 results')
 

	
 
    def test_repo_search(self):
 
        self.log_user()
 
@@ -35,3 +35,44 @@ class TestSearchController(TestControlle
 
                                {'q': 'repository:%s def test' % HG_REPO})
 

	
 
        response.mustcontain('4 results')
 

	
 
    def test_search_last(self):
        # a ``last:t`` query on the commit index must report a single hit
        self.log_user()
        search_url = url(controller='search', action='index')
        params = {'q': 'last:t', 'type': 'commit'}
        resp = self.app.get(search_url, params)

        resp.mustcontain('1 results')
 

	
 
    def test_search_commit_message(self):
        # searching for the words of a commit message must report a single
        # hit and show the raw id of that changeset
        self.log_user()
        search_url = url(controller='search', action='index')
        params = {'q': 'bother to ask where to fetch repo during tests',
                  'type': 'commit'}
        resp = self.app.get(search_url, params)

        for expected in ('1 results',
                         'a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1'):
            resp.mustcontain(expected)
 

	
 
    def test_search_commit_changed_file(self):
        # a ``changed:`` query must show the raw id of the expected changeset
        self.log_user()
        search_url = url(controller='search', action='index')
        params = {'q': 'changed:tests/utils.py',
                  'type': 'commit'}
        resp = self.app.get(search_url, params)

        resp.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
 

	
 
    def test_search_commit_added_file(self):
        # an ``added:`` query must report a single hit and show the raw id of
        # the expected changeset
        self.log_user()
        search_url = url(controller='search', action='index')
        params = {'q': 'added:README.rst',
                  'type': 'commit'}
        resp = self.app.get(search_url, params)

        for expected in ('1 results',
                         '3803844fdbd3b711175fc3da9bdacfcd6d29a6fb'):
            resp.mustcontain(expected)
 

	
 
    def test_search_author(self):
        # combining ``author:`` and ``revision:`` must report a single hit
        self.log_user()
        search_url = url(controller='search', action='index')
        params = {'q': 'author:marcin@python-blog.com revision:0',
                  'type': 'commit'}
        resp = self.app.get(search_url, params)

        resp.mustcontain('1 results')
0 comments (0 inline, 0 general)