kallithea Files · rhodecode/lib/indexers/__init_

Files @ 05528ad948c4
Branch filter:
Location: kallithea/rhodecode/lib/indexers/init.py

05528ad948c4 6.5 KiB text/x-python Show Annotation Show as Raw Download as Raw
Marcin Kuzminski
Hacking for git support,and new faster repo scan
import os
import sys
from os.path import dirname as dn, join as jn

#to get the rhodecode import
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

from rhodecode.config.environment import load_environment
from rhodecode.model.hg import HgModel
from shutil import rmtree
from webhelpers.html.builder import escape
from vcs.utils.lazy import LazyProperty

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

import traceback


#LOCATION WE KEEP THE INDEX
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')

#EXTENSIONS WE WANT TO INDEX CONTENT OFF
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']

#CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


#INDEX SCHEMA DEFINITION
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(ANALYZER),
                             scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))


IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = SimpleFragmenter(200)

from paste.script import command
import ConfigParser

class MakeIndex(command.Command):

    max_args = 1
    min_args = 1

    usage = "CONFIG_FILE"
    summary = "Creates index for full text search given configuration file"
    group_name = "Whoosh indexing"

    parser = command.Command.standard_parser(verbose=True)
#    parser.add_option('--repo-location',
#                      action='store',
#                      dest='repo_location',
#                      help="Specifies repositories location to index",
#                      )
    parser.add_option('-f',
                      action='store_true',
                      dest='full_index',
                      help="Specifies that index should be made full i.e"
                            " destroy old and build from scratch",
                      default=False)
    def command(self):
        config_name = self.args[0]

        p = config_name.split('/')
        if len(p) == 1:
            root = '.'
        else:
            root = '/'.join(p[:-1])
        print root
        config = ConfigParser.ConfigParser({'here':root})
        config.read(config_name)
        print dict(config.items('app:main'))['index_dir']
        index_location = dict(config.items('app:main'))['index_dir']
        #return

        #=======================================================================
        # WHOOSH DAEMON
        #=======================================================================
        from rhodecode.lib.pidlock import LockHeld, DaemonLock
        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
        try:
            l = DaemonLock()
            WhooshIndexingDaemon(index_location=index_location)\
                .run(full_index=self.options.full_index)
            l.release()
        except LockHeld:
            sys.exit(1)


class ResultWrapper(object):
    def __init__(self, search_type, searcher, matcher, highlight_items):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        self.fragment_size = 200 / 2

    @LazyProperty
    def doc_ids(self):
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            chunks = [offsets for offsets in self.get_chunks()]
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows Iteration over results,and lazy generate content

        *Requires* implementation of ``__getitem__`` method.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getslice__(self, i, j):
        """
        Slicing of resultWrapper
        """
        slice = []
        for docid in self.doc_ids[i:j]:
            slice.append(self.get_full_content(docid))
        return slice


    def get_full_content(self, docid):
        res = self.searcher.stored_fields(docid[0])
        f_path = res['path'][res['path'].find(res['repository']) \
                             + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def get_short_content(self, res, chunks):

        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but not overlap chunks so it doesn't highlight the same
        close occurrences twice.
        @param matcher:
        @param size:
        """
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)

    def highlight(self, content, top=5):
        if self.search_type != 'content':
            return ''
        hl = highlight(escape(content),
                 self.highlight_items,
                 analyzer=ANALYZER,
                 fragmenter=FRAGMENTER,
                 formatter=FORMATTER,
                 top=top)
        return hl
Location: kallithea/rhodecode/lib/indexers/__init__.py

Location: kallithea/rhodecode/lib/indexers/init.py