Changeset - b6c902d88472
[Not reviewed]
beta
0 7 0
Marcin Kuzminski - 14 years ago 2012-02-16 04:41:58
marcin@python-works.com
bumbed whoosh to 2.3.X series
- fixed some html issues on search results
7 files changed with 49 insertions and 33 deletions:
0 comments (0 inline, 0 general)
requires.txt
Show inline comments
 
Pylons==1.0.0
 
Beaker==1.6.2
 
WebHelpers>=1.2
 
formencode==1.2.4
 
SQLAlchemy==0.7.4
 
Mako==0.5.0
 
pygments>=1.4
 
whoosh<1.8
 
whoosh<2.4
 
celery>=2.2.5,<2.3
 
babel
 
python-dateutil>=1.5.0,<2.0.0
 
dulwich>=0.8.0,<0.9.0
 
vcs>=0.2.3.dev
 
webob==1.0.8
 
markdown==2.0.3
 
docutils==0.8.1
 
py-bcrypt
 
mercurial>=2.1,<2.2
 
\ No newline at end of file
rhodecode/__init__.py
Show inline comments
 
@@ -23,49 +23,49 @@
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import sys
 
import platform
 

	
 
VERSION = (1, 3, 0, 'beta')
 
__version__ = '.'.join((str(each) for each in VERSION[:4]))
 
__dbversion__ = 4  # defines current db version for migrations
 
__platform__ = platform.system()
 
__license__ = 'GPLv3'
 
__py_version__ = sys.version_info
 

	
 
PLATFORM_WIN = ('Windows')
 
PLATFORM_OTHERS = ('Linux', 'Darwin', 'FreeBSD', 'OpenBSD', 'SunOS')
 

	
 
requirements = [
 
    "Pylons==1.0.0",
 
    "Beaker==1.6.2",
 
    "WebHelpers>=1.2",
 
    "formencode==1.2.4",
 
    "SQLAlchemy==0.7.4",
 
    "Mako==0.5.0",
 
    "pygments>=1.4",
 
    "whoosh<1.8",
 
    "whoosh<2.4",
 
    "celery>=2.2.5,<2.3",
 
    "babel",
 
    "python-dateutil>=1.5.0,<2.0.0",
 
    "dulwich>=0.8.0,<0.9.0",
 
    "vcs>=0.2.3.dev",
 
    "webob==1.0.8",
 
    "markdown==2.0.3",
 
    "docutils==0.8.1",
 
]
 

	
 
if __py_version__ < (2, 6):
 
    requirements.append("simplejson")
 
    requirements.append("pysqlite")
 

	
 
if __platform__ in PLATFORM_WIN:
 
    requirements.append("mercurial>=2.1,<2.2")
 
else:
 
    requirements.append("py-bcrypt")
 
    requirements.append("mercurial>=2.1,<2.2")
 

	
 

	
 
try:
 
    from rhodecode.lib import get_current_revision
 
    _rev = get_current_revision()
rhodecode/controllers/search.py
Show inline comments
 
@@ -5,49 +5,49 @@
 

	
 
    Search controller for rhodecode
 

	
 
    :created_on: Aug 7, 2010
 
    :author: marcink
 
    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
 
    :license: GPLv3, see COPYING for more details.
 
"""
 
# This program is free software: you can redistribute it and/or modify
 
# it under the terms of the GNU General Public License as published by
 
# the Free Software Foundation, either version 3 of the License, or
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import logging
 
import traceback
 

	
 
from pylons.i18n.translation import _
 
from pylons import request, config, session, tmpl_context as c
 
from pylons import request, config, tmpl_context as c
 

	
 
from rhodecode.lib.auth import LoginRequired
 
from rhodecode.lib.base import BaseController, render
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, ResultWrapper
 

	
 
from webhelpers.paginate import Page
 
from webhelpers.util import update_params
 

	
 
from whoosh.index import open_dir, EmptyIndexError
 
from whoosh.qparser import QueryParser, QueryParserError
 
from whoosh.query import Phrase, Wildcard, Term, Prefix
 

	
 
log = logging.getLogger(__name__)
 

	
 

	
 
class SearchController(BaseController):
 

	
 
    @LoginRequired()
 
    def __before__(self):
 
        super(SearchController, self).__before__()
 

	
 
    def index(self, search_repo=None):
 
        c.repo_name = search_repo
 
        c.formated_results = []
 
@@ -55,67 +55,72 @@ class SearchController(BaseController):
 
        c.cur_query = request.GET.get('q', None)
 
        c.cur_type = request.GET.get('type', 'source')
 
        c.cur_search = search_type = {'content': 'content',
 
                                      'commit': 'content',
 
                                      'path': 'path',
 
                                      'repository': 'repository'}\
 
                                      .get(c.cur_type, 'content')
 

	
 
        if c.cur_query:
 
            cur_query = c.cur_query.lower()
 

	
 
        if c.cur_query:
 
            p = int(request.params.get('page', 1))
 
            highlight_items = set()
 
            try:
 
                idx = open_dir(config['app_conf']['index_dir'],
 
                               indexname=IDX_NAME)
 
                searcher = idx.searcher()
 

	
 
                qp = QueryParser(search_type, schema=SCHEMA)
 
                if c.repo_name:
 
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
 
                try:
 
                    query = qp.parse(unicode(cur_query))
 

	
 
                    # extract words for highlight
 
                    if isinstance(query, Phrase):
 
                        highlight_items.update(query.words)
 
                    elif isinstance(query, Prefix):
 
                        highlight_items.add(query.text)
 
                    else:
 
                        for i in query.all_terms():
 
                            if i[0] == 'content':
 
                                highlight_items.add(i[1])
 

	
 
                    matcher = query.matcher(searcher)
 

	
 
                    log.debug(query)
 
                    log.debug(highlight_items)
 
                    results = searcher.search(query)
 
                    res_ln = len(results)
 
                    c.runtime = '%s results (%.3f seconds)' \
 
                        % (res_ln, results.runtime)
 
                    c.runtime = '%s results (%.3f seconds)' % (
 
                        res_ln, results.runtime
 
                    )
 

	
 
                    def url_generator(**kw):
 
                        return update_params("?q=%s&type=%s" \
 
                                           % (c.cur_query, c.cur_search), **kw)
 

	
 
                    c.formated_results = Page(
 
                                ResultWrapper(search_type, searcher, matcher,
 
                                              highlight_items),
 
                                page=p, item_count=res_ln,
 
                                items_per_page=10, url=url_generator)
 
                        page=p,
 
                        item_count=res_ln,
 
                        items_per_page=10,
 
                        url=url_generator
 
                    )
 

	
 
                except QueryParserError:
 
                    c.runtime = _('Invalid search query. Try quoting it.')
 
                searcher.close()
 
            except (EmptyIndexError, IOError):
 
                log.error(traceback.format_exc())
 
                log.error('Empty Index data')
 
                c.runtime = _('There is no index to search in. '
 
                              'Please run whoosh indexer')
 
            except (Exception):
 
                log.error(traceback.format_exc())
 
                c.runtime = _('An error occurred during this search operation')
 

	
 

	
 
        # Return a rendered template
 
        return render('/search/search.html')
rhodecode/lib/indexers/__init__.py
Show inline comments
 
@@ -16,80 +16,81 @@
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import os
 
import sys
 
import traceback
 
from os.path import dirname as dn, join as jn
 

	
 
#to get the rhodecode import
 
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 

	
 
from string import strip
 
from shutil import rmtree
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 
from whoosh.index import create_in, open_dir
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
 
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
 

	
 
from webhelpers.html.builder import escape
 
from sqlalchemy import engine_from_config
 
from vcs.utils.lazy import LazyProperty
 

	
 
from rhodecode.model import init_model
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.model.repo import RepoModel
 
from rhodecode.config.environment import load_environment
 
from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
 
from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP, LazyProperty
 
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
 

	
 
#EXTENSIONS WE WANT TO INDEX CONTENT OFF
 
INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
 

	
 
#CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(owner=TEXT(),
 
SCHEMA = Schema(
 
    owner=TEXT(),
 
                repository=TEXT(stored=True),
 
                path=TEXT(stored=True),
 
                content=FieldType(format=Characters(ANALYZER),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                             scorable=True, stored=True),
 
                modtime=STORED(), extension=TEXT(stored=True))
 

	
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = SimpleFragmenter(200)
 
FRAGMENTER = ContextFragmenter(200)
 

	
 

	
 
class MakeIndex(BasePasterCommand):
 

	
 
    max_args = 1
 
    min_args = 1
 

	
 
    usage = "CONFIG_FILE"
 
    summary = "Creates index for full text search given configuration file"
 
    group_name = "RhodeCode"
 
    takes_config_file = -1
 
    parser = Command.standard_parser(verbose=True)
 

	
 
    def command(self):
 

	
 
        from pylons import config
 
        add_cache(config)
 
        engine = engine_from_config(config, 'sqlalchemy.db1.')
 
        init_model(engine)
 

	
 
        index_location = config['index_dir']
 
        repo_location = self.options.repo_location \
 
            if self.options.repo_location else RepoModel().repos_path
 
        repo_list = map(strip, self.options.repo_list.split(',')) \
 
@@ -115,112 +116,114 @@ class MakeIndex(BasePasterCommand):
 
                          action='store',
 
                          dest='repo_location',
 
                          help="Specifies repositories location to index OPTIONAL",
 
                          )
 
        self.parser.add_option('--index-only',
 
                          action='store',
 
                          dest='repo_list',
 
                          help="Specifies a comma separated list of repositores "
 
                                "to build index on OPTIONAL",
 
                          )
 
        self.parser.add_option('-f',
 
                          action='store_true',
 
                          dest='full_index',
 
                          help="Specifies that index should be made full i.e"
 
                                " destroy old and build from scratch",
 
                          default=False)
 

	
 

	
 
class ResultWrapper(object):
 
    def __init__(self, search_type, searcher, matcher, highlight_items):
 
        self.search_type = search_type
 
        self.searcher = searcher
 
        self.matcher = matcher
 
        self.highlight_items = highlight_items
 
        self.fragment_size = 200 / 2
 
        self.fragment_size = 200
 

	
 
    @LazyProperty
 
    def doc_ids(self):
 
        docs_id = []
 
        while self.matcher.is_active():
 
            docnum = self.matcher.id()
 
            chunks = [offsets for offsets in self.get_chunks()]
 
            docs_id.append([docnum, chunks])
 
            self.matcher.next()
 
        return docs_id
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return len(self.doc_ids)
 

	
 
    def __iter__(self):
 
        """
 
        Allows Iteration over results,and lazy generate content
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for docid in self.doc_ids:
 
            yield self.get_full_content(docid)
 

	
 
    def __getitem__(self, key):
 
        """
 
        Slicing of resultWrapper
 
        """
 
        i, j = key.start, key.stop
 

	
 
        slice = []
 
        slices = []
 
        for docid in self.doc_ids[i:j]:
 
            slice.append(self.get_full_content(docid))
 
        return slice
 
            slices.append(self.get_full_content(docid))
 
        return slices
 

	
 
    def get_full_content(self, docid):
 
        res = self.searcher.stored_fields(docid[0])
 
        f_path = res['path'][res['path'].find(res['repository']) \
 
                             + len(res['repository']):].lstrip('/')
 

	
 
        content_short = self.get_short_content(res, docid[1])
 
        res.update({'content_short':content_short,
 
                    'content_short_hl':self.highlight(content_short),
 
                    'f_path':f_path})
 

	
 
        return res
 

	
 
    def get_short_content(self, res, chunks):
 

	
 
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
 

	
 
    def get_chunks(self):
 
        """
 
        Smart function that implements chunking the content
 
        but not overlap chunks so it doesn't highlight the same
 
        close occurrences twice.
 

	
 
        :param matcher:
 
        :param size:
 
        """
 
        memory = [(0, 0)]
 
        for span in self.matcher.spans():
 
            start = span.startchar or 0
 
            end = span.endchar or 0
 
            start_offseted = max(0, start - self.fragment_size)
 
            end_offseted = end + self.fragment_size
 

	
 
            if start_offseted < memory[-1][1]:
 
                start_offseted = memory[-1][1]
 
            memory.append((start_offseted, end_offseted,))
 
            yield (start_offseted, end_offseted,)
 

	
 
    def highlight(self, content, top=5):
 
        if self.search_type != 'content':
 
            return ''
 
        hl = highlight(escape(content),
 
                 self.highlight_items,
 
        hl = highlight(
 
            text=escape(content),
 
            terms=self.highlight_items,
 
                 analyzer=ANALYZER,
 
                 fragmenter=FRAGMENTER,
 
                 formatter=FORMATTER,
 
                 top=top)
 
            top=top
 
        )
 
        return hl
rhodecode/lib/indexers/daemon.py
Show inline comments
 
@@ -28,95 +28,94 @@ import sys
 
import logging
 
import traceback
 

	
 
from shutil import rmtree
 
from time import mktime
 

	
 
from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
#to get the rhodecode import
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 

	
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.lib import safe_unicode
 
from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME
 

	
 
from vcs.exceptions import ChangesetError, RepositoryError, \
 
    NodeDoesNotExistError
 

	
 
from whoosh.index import create_in, open_dir
 

	
 

	
 

	
 
log = logging.getLogger('whooshIndexer')
 
# create logger
 
log.setLevel(logging.DEBUG)
 
log.propagate = False
 
# create console handler and set level to debug
 
ch = logging.StreamHandler()
 
ch.setLevel(logging.DEBUG)
 

	
 
# create formatter
 
formatter = logging.Formatter("%(asctime)s - %(name)s -"
 
                              " %(levelname)s - %(message)s")
 

	
 
# add formatter to ch
 
ch.setFormatter(formatter)
 

	
 
# add ch to logger
 
log.addHandler(ch)
 

	
 

	
 
class WhooshIndexingDaemon(object):
 
    """
 
    Daemon for atomic jobs
 
    """
 

	
 
    def __init__(self, indexname='HG_INDEX', index_location=None,
 
    def __init__(self, indexname=IDX_NAME, index_location=None,
 
                 repo_location=None, sa=None, repo_list=None):
 
        self.indexname = indexname
 

	
 
        self.index_location = index_location
 
        if not index_location:
 
            raise Exception('You have to provide index location')
 

	
 
        self.repo_location = repo_location
 
        if not repo_location:
 
            raise Exception('You have to provide repositories location')
 

	
 
        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
 

	
 
        if repo_list:
 
            filtered_repo_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_list:
 
                    filtered_repo_paths[repo_name] = repo
 

	
 
            self.repo_paths = filtered_repo_paths
 

	
 

	
 
        self.initial = False
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 

	
 
    def get_paths(self, repo):
 
        """recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        index_paths_ = set()
 
        try:
 
            tip = repo.get_changeset('tip')
 
            for topnode, dirs, files in tip.walk('/'):
 
                for f in files:
 
                    index_paths_.add(jn(repo.path, f.path))
 

	
 
        except RepositoryError, e:
 
            log.debug(traceback.format_exc())
 
            pass
 
        return index_paths_
 

	
 
    def get_node(self, repo, path):
 
@@ -134,71 +133,69 @@ class WhooshIndexingDaemon(object):
 

	
 
        #we just index the content of chosen files, and skip binary files
 
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
 

	
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                          'replacing with empty content', path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]' % path)
 

	
 
        else:
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 

	
 
        writer.add_document(owner=unicode(repo.contact),
 
                        repository=safe_unicode(repo_name),
 
                        path=safe_unicode(path),
 
                        content=u_content,
 
                        modtime=self.get_node_mtime(node),
 
                        extension=node.extension)
 

	
 

	
 
    def build_index(self):
 
        if os.path.exists(self.index_location):
 
            log.debug('removing previous index')
 
            rmtree(self.index_location)
 

	
 
        if not os.path.exists(self.index_location):
 
            os.mkdir(self.index_location)
 

	
 
        idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
 
        writer = idx.writer()
 

	
 
        for repo_name, repo in self.repo_paths.items():
 
            log.debug('building index @ %s' % repo.path)
 

	
 
            for idx_path in self.get_paths(repo):
 
                self.add_doc(writer, idx_path, repo, repo_name)
 

	
 
        log.debug('>> COMMITING CHANGES <<')
 
        writer.commit(merge=True)
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 

	
 

	
 
    def update_index(self):
 
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')
 

	
 
        idx = open_dir(self.index_location, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
 
        # The set of all paths we need to re-index
 
        to_index = set()
 

	
 
        reader = idx.reader()
 
        writer = idx.writer()
 

	
 
        # Loop over the stored fields in the index
 
        for fields in reader.all_stored_fields():
 
            indexed_path = fields['path']
 
            indexed_paths.add(indexed_path)
 

	
 
            repo = self.repo_paths[fields['repository']]
 

	
 
            try:
 
                node = self.get_node(repo, indexed_path)
 
            except (ChangesetError, NodeDoesNotExistError):
 
                # This file was deleted since it was indexed
 
                log.debug('removing from index %s' % indexed_path)
rhodecode/public/css/pygments.css
Show inline comments
 
@@ -44,52 +44,62 @@ div.codeblock .code-header .stats .butto
 
div.codeblock .code-header .author{
 
	margin-left:25px;
 
	font-weight: bold;
 
	height: 25px;
 
}
 
div.codeblock .code-header .author .user{
 
	padding-top:3px;
 
}
 
div.codeblock .code-header .commit{
 
	margin-left:25px;
 
	font-weight: normal;
 
	white-space:pre;
 
}
 

	
 
div.codeblock .code-body table{
 
    width: 0 !important;
 
    border: 0px !important;
 
}
 
div.codeblock .code-body table td {
 
	border: 0px !important;
 
}
 
div.code-body {
 
	background-color: #FFFFFF;
 
}
 
div.code-body pre .match{
 

	
 
div.codeblock .code-header .search-path {
 
	padding: 0px 0px 0px 10px;
 
}
 

	
 
div.search-code-body {
 
    background-color: #FFFFFF;
 
    padding: 5px 0px 5px 10px;
 
}
 

	
 
div.search-code-body pre .match{
 
	background-color: #FAFFA6;
 
}
 
div.code-body pre .break{
 
div.search-code-body pre .break{
 
	background-color: #DDE7EF;
 
	width: 100%;
 
	color: #747474;
 
	display: block;
 
	
 
}
 
div.annotatediv{
 
	margin-left:2px;
 
	margin-right:4px;
 
}
 
.code-highlight {
 
    padding: 0px;
 
    margin-top: 5px;
 
    margin-bottom: 5px;
 
    border-left: 2px solid #ccc;
 
}
 
.code-highlight pre, .linenodiv pre { 
 
	padding: 5px;
 
    margin: 0;
 
}
 
.code-highlight pre div:target { 
 
    background-color: #FFFFBE !important;
 
}
 
    
rhodecode/templates/search/search_content.html
Show inline comments
 
##content highligthing
 

	
 
%for cnt,sr in enumerate(c.formated_results):
 
    %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
 
    <div class="table">
 
        <div id="body${cnt}" class="codeblock">
 
            <div class="code-header">
 
                <div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
 
                <div class="search-path">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}
 
            </div>
 
            <div class="code-body">
 
            </div>
 
            <div class="search-code-body">
 
                <pre>${h.literal(sr['content_short_hl'])}</pre>
 
            </div>
 
        </div>
 
    </div>
 
    %else:
 
        %if cnt == 0:
 
        <div class="table">
 
            <div id="body${cnt}" class="codeblock">
 
                <div class="error">${_('Permission denied')}</div>
 
            </div>
 
        </div>
 
        %endif
 

	
 
    %endif
 
%endfor
 
%if c.cur_query and c.formated_results:
 
<div class="pagination-wh pagination-left">
 
    ${c.formated_results.pager('$link_previous ~2~ $link_next')}
 
</div>
 
%endif
0 comments (0 inline, 0 general)