Changeset - 2ff913970025
[Not reviewed]
default
0 2 0
FUJIWARA Katsunori - 9 years ago 2017-01-22 18:17:38
foozy@lares.dti.ne.jp
journal: make "username:" filtering condition work as expected

As described in previous revision, using TEXT in JOURNAL_SCHEMA causes
unexpected results for "username:", too.

- tokenization by non-alphanumeric characters
- removing "stop words"

To make "username:" filtering condition work as expected, this
revision uses ID instead of TEST for "username" of JOURNAL_COLUMN.
2 files changed with 3 insertions and 3 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/__init__.py
Show inline comments
 
# -*- coding: utf-8 -*-
 
# This program is free software: you can redistribute it and/or modify
 
# it under the terms of the GNU General Public License as published by
 
# the Free Software Foundation, either version 3 of the License, or
 
# (at your option) any later version.
 
#
 
# This program is distributed in the hope that it will be useful,
 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU General Public License for more details.
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
"""
 
kallithea.lib.indexers
 
~~~~~~~~~~~~~~~~~~~~~~
 

	
 
Whoosh indexing module for Kallithea
 

	
 
This file was forked by the Kallithea project in July 2014.
 
Original author and date, and relevant copyright and licensing information is below:
 
:created_on: Aug 17, 2010
 
:author: marcink
 
:copyright: (c) 2013 RhodeCode GmbH, and others.
 
:license: GPLv3, see LICENSE.md for more details.
 
"""
 

	
 
import os
 
import sys
 
import logging
 
from os.path import dirname
 

	
 
# Add location of top level folder to sys.path
 
sys.path.append(dirname(dirname(dirname(os.path.realpath(__file__)))))
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter
 
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType, DATETIME
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight as whoosh_highlight, HtmlFormatter, ContextFragmenter
 
from kallithea.lib.utils2 import LazyProperty
 

	
 
log = logging.getLogger(__name__)
 

	
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(
 
    fileid=ID(unique=True),
 
    owner=TEXT(),
 
    repository=TEXT(stored=True),
 
    path=TEXT(stored=True),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = ContextFragmenter(200)
 

	
 
CHGSETS_SCHEMA = Schema(
 
    raw_id=ID(unique=True, stored=True),
 
    date=NUMERIC(stored=True),
 
    last=BOOLEAN(),
 
    owner=TEXT(),
 
    repository=ID(unique=True, stored=True),
 
    author=TEXT(stored=True),
 
    message=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    parents=TEXT(),
 
    added=TEXT(),
 
    removed=TEXT(),
 
    changed=TEXT(),
 
)
 

	
 
CHGSET_IDX_NAME = 'CHGSET_INDEX'
 

	
 
# used only to generate queries in journal
 
JOURNAL_SCHEMA = Schema(
 
    username=TEXT(),
 
    username=ID(),
 
    date=DATETIME(),
 
    action=TEXT(),
 
    repository=ID(),
 
    ip=TEXT(),
 
)
 

	
 

	
 
class WhooshResultWrapper(object):
 
    def __init__(self, search_type, searcher, matcher, highlight_items,
 
                 repo_location):
 
        self.search_type = search_type
 
        self.searcher = searcher
 
        self.matcher = matcher
 
        self.highlight_items = highlight_items
 
        self.fragment_size = 200
 
        self.repo_location = repo_location
 

	
 
    @LazyProperty
 
    def doc_ids(self):
 
        docs_id = []
 
        while self.matcher.is_active():
 
            docnum = self.matcher.id()
 
            chunks = [offsets for offsets in self.get_chunks()]
 
            docs_id.append([docnum, chunks])
 
            self.matcher.next()
 
        return docs_id
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return len(self.doc_ids)
 

	
 
    def __iter__(self):
 
        """
 
        Allows Iteration over results,and lazy generate content
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for docid in self.doc_ids:
 
            yield self.get_full_content(docid)
 

	
 
    def __getitem__(self, key):
 
        """
 
        Slicing of resultWrapper
 
        """
 
        i, j = key.start, key.stop
 

	
 
        slices = []
 
        for docid in self.doc_ids[i:j]:
 
            slices.append(self.get_full_content(docid))
 
        return slices
 

	
 
    def get_full_content(self, docid):
 
        res = self.searcher.stored_fields(docid[0])
 
        log.debug('result: %s', res)
 
        if self.search_type == 'content':
 
            full_repo_path = os.path.join(self.repo_location, res['repository'])
 
            f_path = res['path'].split(full_repo_path)[-1]
 
            f_path = f_path.lstrip(os.sep)
 
            content_short = self.get_short_content(res, docid[1])
 
            res.update({'content_short': content_short,
 
                        'content_short_hl': self.highlight(content_short),
 
                        'f_path': f_path
 
            })
 
        elif self.search_type == 'path':
 
            full_repo_path = os.path.join(self.repo_location, res['repository'])
 
            f_path = res['path'].split(full_repo_path)[-1]
 
            f_path = f_path.lstrip(os.sep)
 
            res.update({'f_path': f_path})
 
        elif self.search_type == 'message':
 
            res.update({'message_hl': self.highlight(res['message'])})
 

	
 
        log.debug('result: %s', res)
 

	
 
        return res
 

	
 
    def get_short_content(self, res, chunks):
 

	
 
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
 

	
 
    def get_chunks(self):
 
        """
 
        Smart function that implements chunking the content
 
        but not overlap chunks so it doesn't highlight the same
 
        close occurrences twice.
 
        """
 
        memory = [(0, 0)]
 
        if self.matcher.supports('positions'):
 
            for span in self.matcher.spans():
 
                start = span.startchar or 0
 
                end = span.endchar or 0
 
                start_offseted = max(0, start - self.fragment_size)
 
                end_offseted = end + self.fragment_size
 

	
 
                if start_offseted < memory[-1][1]:
 
                    start_offseted = memory[-1][1]
 
                memory.append((start_offseted, end_offseted,))
 
                yield (start_offseted, end_offseted,)
 

	
 
    def highlight(self, content, top=5):
 
        if self.search_type not in ['content', 'message']:
 
            return ''
 
        hl = whoosh_highlight(
 
            text=content,
 
            terms=self.highlight_items,
 
            analyzer=ANALYZER,
 
            fragmenter=FRAGMENTER,
 
            formatter=FORMATTER,
 
            top=top
 
        )
 
        return hl
kallithea/tests/functional/test_admin.py
Show inline comments
 
import os
 
import csv
 
import datetime
 
from kallithea.tests.base import *
 
from kallithea.model.db import UserLog
 
from kallithea.model.meta import Session
 
from kallithea.lib.utils2 import safe_unicode
 
from os.path import dirname
 

	
 
FIXTURES = os.path.join(dirname(dirname(os.path.abspath(__file__))), 'fixtures')
 

	
 

	
 
class TestAdminController(TestController):
 

	
 
    @classmethod
 
    def setup_class(cls):
 
        UserLog.query().delete()
 
        Session().commit()
 

	
 
        def strptime(val):
 
            fmt = '%Y-%m-%d %H:%M:%S'
 
            if '.' not in val:
 
                return datetime.datetime.strptime(val, fmt)
 

	
 
            nofrag, frag = val.split(".")
 
            date = datetime.datetime.strptime(nofrag, fmt)
 

	
 
            frag = frag[:6]  # truncate to microseconds
 
            frag += (6 - len(frag)) * '0'  # add 0s
 
            return date.replace(microsecond=int(frag))
 

	
 
        with open(os.path.join(FIXTURES, 'journal_dump.csv')) as f:
 
            for row in csv.DictReader(f):
 
                ul = UserLog()
 
                for k, v in row.iteritems():
 
                    v = safe_unicode(v)
 
                    if k == 'action_date':
 
                        v = strptime(v)
 
                    if k in ['user_id', 'repository_id']:
 
                        # nullable due to FK problems
 
                        v = None
 
                    setattr(ul, k, v)
 
                Session().add(ul)
 
            Session().commit()
 

	
 
    @classmethod
 
    def teardown_class(cls):
 
        UserLog.query().delete()
 
        Session().commit()
 

	
 
    def test_index(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index'))
 
        response.mustcontain('Admin Journal')
 

	
 
    def test_filter_all_entries(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',))
 
        response.mustcontain(' 2036 Entries')
 

	
 
    def test_filter_journal_filter_exact_match_on_repository(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:xxx'))
 
        response.mustcontain(' 3 Entries')
 

	
 
    def test_filter_journal_filter_exact_match_on_repository_CamelCase(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:XxX'))
 
        response.mustcontain(' 3 Entries')
 

	
 
    def test_filter_journal_filter_wildcard_on_repository(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:*test*'))
 
        response.mustcontain(' 862 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_repository(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:test*'))
 
        response.mustcontain(' 257 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_repository_CamelCase(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:Test*'))
 
        response.mustcontain(' 257 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_repository_and_user(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:test* AND username:demo'))
 
        response.mustcontain(' 130 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_repository_or_other_repo(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='repository:test* OR repository:xxx'))
 
        response.mustcontain(' 260 Entries')  # 257 + 3
 

	
 
    def test_filter_journal_filter_exact_match_on_username(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:demo'))
 
        response.mustcontain(' 1087 Entries')
 

	
 
    def test_filter_journal_filter_exact_match_on_username_camelCase(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:DemO'))
 
        response.mustcontain(' 1087 Entries')
 

	
 
    def test_filter_journal_filter_wildcard_on_username(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:*test*'))
 
        response.mustcontain(' 100 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_username(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:demo*'))
 
        response.mustcontain(' 1101 Entries')
 

	
 
    def test_filter_journal_filter_prefix_on_user_or_other_user(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:demo OR username:volcan'))
 
        response.mustcontain(' 1095 Entries')  # 1087 + 8
 

	
 
    def test_filter_journal_filter_wildcard_on_action(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='action:*pull_request*'))
 
        response.mustcontain(' 187 Entries')
 

	
 
    def test_filter_journal_filter_on_date(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='date:20121010'))
 
        response.mustcontain(' 47 Entries')
 

	
 
    def test_filter_journal_filter_on_date_2(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='date:20121020'))
 
        response.mustcontain(' 17 Entries')
 

	
 
    @parametrize('filter,hit', [
 
        #### "repository:" filtering
 
        # "/" is used for grouping
 
        ('repository:group/test', 4),
 
        # "-" is often used for "-fork"
 
        ('repository:fork-test1', 5),
 
        # using "stop words"
 
        ('repository:this', 1),
 
        ('repository:this/is-it', 1),
 

	
 
        ## additional tests to quickly find out regression in the future
 
        ## (and check case-insensitive search, too)
 
        # non-ascii character "." and "-"
 
        ('repository:TESTIES1.2.3', 4),
 
        ('repository:test_git_repo', 2),
 
        # combination with wildcard "*"
 
        ('repository:GROUP/*', 182),
 
        ('repository:*/test', 7),
 
        ('repository:fork-*', 273),
 
        ('repository:*-Test1', 5),
 

	
 
        #### "username:" filtering
 
        # "-" is valid character
 
        ('username:peso-xxx', 0),
 
        ('username:peso-xxx', 4),
 
        # using "stop words"
 
        ('username:this-is-it', 2036),
 
        ('username:this-is-it', 2),
 

	
 
        ## additional tests to quickly find out regression in the future
 
        ## (and check case-insensitive search, too)
 
        # non-ascii character "." and "-"
 
        ('username:ADMIN_xanroot', 6),
 
        ('username:robert.Zaremba', 3),
 
        # combination with wildcard "*"
 
        ('username:THIS-*', 2),
 
        ('username:*-IT', 2),
 
    ])
 
    def test_filter_journal_filter_tokenization(self, filter, hit):
 
        self.log_user()
 

	
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter=filter))
 
        if hit != 1:
 
            response.mustcontain(' %s Entries' % hit)
 
        else:
 
            response.mustcontain(' 1 Entry')
0 comments (0 inline, 0 general)