Changeset - b6c902d88472
[Not reviewed]
beta
0 7 0
Marcin Kuzminski - 14 years ago 2012-02-16 04:41:58
marcin@python-works.com
bumbed whoosh to 2.3.X series
- fixed some html issues on search results
7 files changed with 62 insertions and 46 deletions:
0 comments (0 inline, 0 general)
requires.txt
Show inline comments
 
Pylons==1.0.0
 
Beaker==1.6.2
 
WebHelpers>=1.2
 
formencode==1.2.4
 
SQLAlchemy==0.7.4
 
Mako==0.5.0
 
pygments>=1.4
 
whoosh<1.8
 
whoosh<2.4
 
celery>=2.2.5,<2.3
 
babel
 
python-dateutil>=1.5.0,<2.0.0
 
dulwich>=0.8.0,<0.9.0
 
vcs>=0.2.3.dev
 
webob==1.0.8
 
markdown==2.0.3
 
docutils==0.8.1
 
py-bcrypt
 
mercurial>=2.1,<2.2
 
\ No newline at end of file
rhodecode/__init__.py
Show inline comments
 
@@ -35,25 +35,25 @@ __py_version__ = sys.version_info
 

	
 
PLATFORM_WIN = ('Windows')
 
PLATFORM_OTHERS = ('Linux', 'Darwin', 'FreeBSD', 'OpenBSD', 'SunOS')
 

	
 
requirements = [
 
    "Pylons==1.0.0",
 
    "Beaker==1.6.2",
 
    "WebHelpers>=1.2",
 
    "formencode==1.2.4",
 
    "SQLAlchemy==0.7.4",
 
    "Mako==0.5.0",
 
    "pygments>=1.4",
 
    "whoosh<1.8",
 
    "whoosh<2.4",
 
    "celery>=2.2.5,<2.3",
 
    "babel",
 
    "python-dateutil>=1.5.0,<2.0.0",
 
    "dulwich>=0.8.0,<0.9.0",
 
    "vcs>=0.2.3.dev",
 
    "webob==1.0.8",
 
    "markdown==2.0.3",
 
    "docutils==0.8.1",
 
]
 

	
 
if __py_version__ < (2, 6):
 
    requirements.append("simplejson")
rhodecode/controllers/search.py
Show inline comments
 
@@ -17,25 +17,25 @@
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
import logging
 
import traceback
 

	
 
from pylons.i18n.translation import _
 
from pylons import request, config, session, tmpl_context as c
 
from pylons import request, config, tmpl_context as c
 

	
 
from rhodecode.lib.auth import LoginRequired
 
from rhodecode.lib.base import BaseController, render
 
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, ResultWrapper
 

	
 
from webhelpers.paginate import Page
 
from webhelpers.util import update_params
 

	
 
from whoosh.index import open_dir, EmptyIndexError
 
from whoosh.qparser import QueryParser, QueryParserError
 
from whoosh.query import Phrase, Wildcard, Term, Prefix
 

	
 
@@ -67,55 +67,60 @@ class SearchController(BaseController):
 
            p = int(request.params.get('page', 1))
 
            highlight_items = set()
 
            try:
 
                idx = open_dir(config['app_conf']['index_dir'],
 
                               indexname=IDX_NAME)
 
                searcher = idx.searcher()
 

	
 
                qp = QueryParser(search_type, schema=SCHEMA)
 
                if c.repo_name:
 
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
 
                try:
 
                    query = qp.parse(unicode(cur_query))
 

	
 
                    # extract words for highlight
 
                    if isinstance(query, Phrase):
 
                        highlight_items.update(query.words)
 
                    elif isinstance(query, Prefix):
 
                        highlight_items.add(query.text)
 
                    else:
 
                        for i in query.all_terms():
 
                            if i[0] == 'content':
 
                                highlight_items.add(i[1])
 

	
 
                    matcher = query.matcher(searcher)
 

	
 
                    log.debug(query)
 
                    log.debug(highlight_items)
 
                    results = searcher.search(query)
 
                    res_ln = len(results)
 
                    c.runtime = '%s results (%.3f seconds)' \
 
                        % (res_ln, results.runtime)
 
                    c.runtime = '%s results (%.3f seconds)' % (
 
                        res_ln, results.runtime
 
                    )
 

	
 
                    def url_generator(**kw):
 
                        return update_params("?q=%s&type=%s" \
 
                                           % (c.cur_query, c.cur_search), **kw)
 

	
 
                    c.formated_results = Page(
 
                                ResultWrapper(search_type, searcher, matcher,
 
                                              highlight_items),
 
                                page=p, item_count=res_ln,
 
                                items_per_page=10, url=url_generator)
 
                        ResultWrapper(search_type, searcher, matcher,
 
                                      highlight_items),
 
                        page=p,
 
                        item_count=res_ln,
 
                        items_per_page=10,
 
                        url=url_generator
 
                    )
 

	
 
                except QueryParserError:
 
                    c.runtime = _('Invalid search query. Try quoting it.')
 
                searcher.close()
 
            except (EmptyIndexError, IOError):
 
                log.error(traceback.format_exc())
 
                log.error('Empty Index data')
 
                c.runtime = _('There is no index to search in. '
 
                              'Please run whoosh indexer')
 
            except (Exception):
 
                log.error(traceback.format_exc())
 
                c.runtime = _('An error occurred during this search operation')
 

	
 

	
 
        # Return a rendered template
 
        return render('/search/search.html')
rhodecode/lib/indexers/__init__.py
Show inline comments
 
@@ -28,56 +28,57 @@ import traceback
 
from os.path import dirname as dn, join as jn
 

	
 
#to get the rhodecode import
 
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 

	
 
from string import strip
 
from shutil import rmtree
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 
from whoosh.index import create_in, open_dir
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
 
from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
 

	
 
from webhelpers.html.builder import escape
 
from sqlalchemy import engine_from_config
 
from vcs.utils.lazy import LazyProperty
 

	
 
from rhodecode.model import init_model
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.model.repo import RepoModel
 
from rhodecode.config.environment import load_environment
 
from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
 
from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP, LazyProperty
 
from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
 

	
 
#EXTENSIONS WE WANT TO INDEX CONTENT OFF
 
# EXTENSIONS WE WANT TO INDEX CONTENT OFF
 
INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
 

	
 
#CUSTOM ANALYZER wordsplit + lowercase filter
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(owner=TEXT(),
 
                repository=TEXT(stored=True),
 
                path=TEXT(stored=True),
 
                content=FieldType(format=Characters(ANALYZER),
 
                             scorable=True, stored=True),
 
                modtime=STORED(), extension=TEXT(stored=True))
 

	
 
SCHEMA = Schema(
 
    owner=TEXT(),
 
    repository=TEXT(stored=True),
 
    path=TEXT(stored=True),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = SimpleFragmenter(200)
 
FRAGMENTER = ContextFragmenter(200)
 

	
 

	
 
class MakeIndex(BasePasterCommand):
 

	
 
    max_args = 1
 
    min_args = 1
 

	
 
    usage = "CONFIG_FILE"
 
    summary = "Creates index for full text search given configuration file"
 
    group_name = "RhodeCode"
 
    takes_config_file = -1
 
    parser = Command.standard_parser(verbose=True)
 
@@ -127,25 +128,25 @@ class MakeIndex(BasePasterCommand):
 
                          dest='full_index',
 
                          help="Specifies that index should be made full i.e"
 
                                " destroy old and build from scratch",
 
                          default=False)
 

	
 

	
 
class ResultWrapper(object):
 
    def __init__(self, search_type, searcher, matcher, highlight_items):
 
        self.search_type = search_type
 
        self.searcher = searcher
 
        self.matcher = matcher
 
        self.highlight_items = highlight_items
 
        self.fragment_size = 200 / 2
 
        self.fragment_size = 200
 

	
 
    @LazyProperty
 
    def doc_ids(self):
 
        docs_id = []
 
        while self.matcher.is_active():
 
            docnum = self.matcher.id()
 
            chunks = [offsets for offsets in self.get_chunks()]
 
            docs_id.append([docnum, chunks])
 
            self.matcher.next()
 
        return docs_id
 

	
 
    def __str__(self):
 
@@ -163,38 +164,38 @@ class ResultWrapper(object):
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for docid in self.doc_ids:
 
            yield self.get_full_content(docid)
 

	
 
    def __getitem__(self, key):
 
        """
 
        Slicing of resultWrapper
 
        """
 
        i, j = key.start, key.stop
 

	
 
        slice = []
 
        slices = []
 
        for docid in self.doc_ids[i:j]:
 
            slice.append(self.get_full_content(docid))
 
        return slice
 
            slices.append(self.get_full_content(docid))
 
        return slices
 

	
 
    def get_full_content(self, docid):
 
        res = self.searcher.stored_fields(docid[0])
 
        f_path = res['path'][res['path'].find(res['repository']) \
 
                             + len(res['repository']):].lstrip('/')
 

	
 
        content_short = self.get_short_content(res, docid[1])
 
        res.update({'content_short':content_short,
 
                    'content_short_hl':self.highlight(content_short),
 
                    'f_path':f_path})
 
        res.update({'content_short': content_short,
 
                    'content_short_hl': self.highlight(content_short),
 
                    'f_path': f_path})
 

	
 
        return res
 

	
 
    def get_short_content(self, res, chunks):
 

	
 
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
 

	
 
    def get_chunks(self):
 
        """
 
        Smart function that implements chunking the content
 
        but not overlap chunks so it doesn't highlight the same
 
        close occurrences twice.
 
@@ -208,19 +209,21 @@ class ResultWrapper(object):
 
            end = span.endchar or 0
 
            start_offseted = max(0, start - self.fragment_size)
 
            end_offseted = end + self.fragment_size
 

	
 
            if start_offseted < memory[-1][1]:
 
                start_offseted = memory[-1][1]
 
            memory.append((start_offseted, end_offseted,))
 
            yield (start_offseted, end_offseted,)
 

	
 
    def highlight(self, content, top=5):
 
        if self.search_type != 'content':
 
            return ''
 
        hl = highlight(escape(content),
 
                 self.highlight_items,
 
                 analyzer=ANALYZER,
 
                 fragmenter=FRAGMENTER,
 
                 formatter=FORMATTER,
 
                 top=top)
 
        hl = highlight(
 
            text=escape(content),
 
            terms=self.highlight_items,
 
            analyzer=ANALYZER,
 
            fragmenter=FRAGMENTER,
 
            formatter=FORMATTER,
 
            top=top
 
        )
 
        return hl
rhodecode/lib/indexers/daemon.py
Show inline comments
 
@@ -40,71 +40,70 @@ sys.path.append(project_path)
 

	
 

	
 
from rhodecode.model.scm import ScmModel
 
from rhodecode.lib import safe_unicode
 
from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME
 

	
 
from vcs.exceptions import ChangesetError, RepositoryError, \
 
    NodeDoesNotExistError
 

	
 
from whoosh.index import create_in, open_dir
 

	
 

	
 

	
 
log = logging.getLogger('whooshIndexer')
 
# create logger
 
log.setLevel(logging.DEBUG)
 
log.propagate = False
 
# create console handler and set level to debug
 
ch = logging.StreamHandler()
 
ch.setLevel(logging.DEBUG)
 

	
 
# create formatter
 
formatter = logging.Formatter("%(asctime)s - %(name)s -"
 
                              " %(levelname)s - %(message)s")
 

	
 
# add formatter to ch
 
ch.setFormatter(formatter)
 

	
 
# add ch to logger
 
log.addHandler(ch)
 

	
 

	
 
class WhooshIndexingDaemon(object):
 
    """
 
    Daemon for atomic jobs
 
    """
 

	
 
    def __init__(self, indexname='HG_INDEX', index_location=None,
 
    def __init__(self, indexname=IDX_NAME, index_location=None,
 
                 repo_location=None, sa=None, repo_list=None):
 
        self.indexname = indexname
 

	
 
        self.index_location = index_location
 
        if not index_location:
 
            raise Exception('You have to provide index location')
 

	
 
        self.repo_location = repo_location
 
        if not repo_location:
 
            raise Exception('You have to provide repositories location')
 

	
 
        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
 

	
 
        if repo_list:
 
            filtered_repo_paths = {}
 
            for repo_name, repo in self.repo_paths.items():
 
                if repo_name in repo_list:
 
                    filtered_repo_paths[repo_name] = repo
 

	
 
            self.repo_paths = filtered_repo_paths
 

	
 

	
 
        self.initial = False
 
        if not os.path.isdir(self.index_location):
 
            os.makedirs(self.index_location)
 
            log.info('Cannot run incremental index since it does not'
 
                     ' yet exist running full build')
 
            self.initial = True
 

	
 
    def get_paths(self, repo):
 
        """recursive walk in root dir and return a set of all path in that dir
 
        based on repository walk function
 
        """
 
        index_paths_ = set()
 
@@ -146,47 +145,45 @@ class WhooshIndexingDaemon(object):
 
        else:
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 

	
 
        writer.add_document(owner=unicode(repo.contact),
 
                        repository=safe_unicode(repo_name),
 
                        path=safe_unicode(path),
 
                        content=u_content,
 
                        modtime=self.get_node_mtime(node),
 
                        extension=node.extension)
 

	
 

	
 
    def build_index(self):
 
        if os.path.exists(self.index_location):
 
            log.debug('removing previous index')
 
            rmtree(self.index_location)
 

	
 
        if not os.path.exists(self.index_location):
 
            os.mkdir(self.index_location)
 

	
 
        idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
 
        writer = idx.writer()
 

	
 
        for repo_name, repo in self.repo_paths.items():
 
            log.debug('building index @ %s' % repo.path)
 

	
 
            for idx_path in self.get_paths(repo):
 
                self.add_doc(writer, idx_path, repo, repo_name)
 

	
 
        log.debug('>> COMMITING CHANGES <<')
 
        writer.commit(merge=True)
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 

	
 

	
 
    def update_index(self):
 
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')
 

	
 
        idx = open_dir(self.index_location, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
 
        # The set of all paths we need to re-index
 
        to_index = set()
 

	
 
        reader = idx.reader()
 
        writer = idx.writer()
 

	
rhodecode/public/css/pygments.css
Show inline comments
 
@@ -56,28 +56,38 @@ div.codeblock .code-header .commit{
 
}
 

	
 
div.codeblock .code-body table{
 
    width: 0 !important;
 
    border: 0px !important;
 
}
 
div.codeblock .code-body table td {
 
	border: 0px !important;
 
}
 
div.code-body {
 
	background-color: #FFFFFF;
 
}
 
div.code-body pre .match{
 

	
 
div.codeblock .code-header .search-path {
 
	padding: 0px 0px 0px 10px;
 
}
 

	
 
div.search-code-body {
 
    background-color: #FFFFFF;
 
    padding: 5px 0px 5px 10px;
 
}
 

	
 
div.search-code-body pre .match{
 
	background-color: #FAFFA6;
 
}
 
div.code-body pre .break{
 
div.search-code-body pre .break{
 
	background-color: #DDE7EF;
 
	width: 100%;
 
	color: #747474;
 
	display: block;
 
	
 
}
 
div.annotatediv{
 
	margin-left:2px;
 
	margin-right:4px;
 
}
 
.code-highlight {
 
    padding: 0px;
rhodecode/templates/search/search_content.html
Show inline comments
 
##content highligthing
 

	
 
%for cnt,sr in enumerate(c.formated_results):
 
    %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
 
    <div class="table">
 
        <div id="body${cnt}" class="codeblock">
 
            <div class="code-header">
 
                <div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
 
                <div class="search-path">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
 
                h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}
 
                </div>
 
            </div>
 
            <div class="code-body">
 
            <div class="search-code-body">
 
                <pre>${h.literal(sr['content_short_hl'])}</pre>
 
            </div>
 
        </div>
 
    </div>
 
    %else:
 
        %if cnt == 0:
 
        <div class="table">
 
            <div id="body${cnt}" class="codeblock">
 
                <div class="error">${_('Permission denied')}</div>
 
            </div>
 
        </div>
 
        %endif
0 comments (0 inline, 0 general)