Changeset - 2ff913970025
[Not reviewed]
default
0 2 0
FUJIWARA Katsunori - 9 years ago 2017-01-22 18:17:38
foozy@lares.dti.ne.jp
journal: make "username:" filtering condition work as expected

As described in previous revision, using TEXT in JOURNAL_SCHEMA causes
unexpected results for "username:", too.

- tokenization by non-alphanumeric characters
- removing "stop words"

To make "username:" filtering condition work as expected, this
revision uses ID instead of TEST for "username" of JOURNAL_COLUMN.
2 files changed with 3 insertions and 3 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/indexers/__init__.py
Show inline comments
 
@@ -34,97 +34,97 @@ from os.path import dirname
 
sys.path.append(dirname(dirname(dirname(os.path.realpath(__file__)))))
 

	
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter
 
from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType, DATETIME
 
from whoosh.formats import Characters
 
from whoosh.highlight import highlight as whoosh_highlight, HtmlFormatter, ContextFragmenter
 
from kallithea.lib.utils2 import LazyProperty
 

	
 
log = logging.getLogger(__name__)
 

	
 
# CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(
 
    fileid=ID(unique=True),
 
    owner=TEXT(),
 
    repository=TEXT(stored=True),
 
    path=TEXT(stored=True),
 
    content=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    modtime=STORED(),
 
    extension=TEXT(stored=True)
 
)
 

	
 
IDX_NAME = 'HG_INDEX'
 
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 
FRAGMENTER = ContextFragmenter(200)
 

	
 
CHGSETS_SCHEMA = Schema(
 
    raw_id=ID(unique=True, stored=True),
 
    date=NUMERIC(stored=True),
 
    last=BOOLEAN(),
 
    owner=TEXT(),
 
    repository=ID(unique=True, stored=True),
 
    author=TEXT(stored=True),
 
    message=FieldType(format=Characters(), analyzer=ANALYZER,
 
                      scorable=True, stored=True),
 
    parents=TEXT(),
 
    added=TEXT(),
 
    removed=TEXT(),
 
    changed=TEXT(),
 
)
 

	
 
CHGSET_IDX_NAME = 'CHGSET_INDEX'
 

	
 
# used only to generate queries in journal
 
JOURNAL_SCHEMA = Schema(
 
    username=TEXT(),
 
    username=ID(),
 
    date=DATETIME(),
 
    action=TEXT(),
 
    repository=ID(),
 
    ip=TEXT(),
 
)
 

	
 

	
 
class WhooshResultWrapper(object):
 
    def __init__(self, search_type, searcher, matcher, highlight_items,
 
                 repo_location):
 
        self.search_type = search_type
 
        self.searcher = searcher
 
        self.matcher = matcher
 
        self.highlight_items = highlight_items
 
        self.fragment_size = 200
 
        self.repo_location = repo_location
 

	
 
    @LazyProperty
 
    def doc_ids(self):
 
        docs_id = []
 
        while self.matcher.is_active():
 
            docnum = self.matcher.id()
 
            chunks = [offsets for offsets in self.get_chunks()]
 
            docs_id.append([docnum, chunks])
 
            self.matcher.next()
 
        return docs_id
 

	
 
    def __str__(self):
 
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
 

	
 
    def __repr__(self):
 
        return self.__str__()
 

	
 
    def __len__(self):
 
        return len(self.doc_ids)
 

	
 
    def __iter__(self):
 
        """
 
        Allows Iteration over results,and lazy generate content
 

	
 
        *Requires* implementation of ``__getitem__`` method.
 
        """
 
        for docid in self.doc_ids:
 
            yield self.get_full_content(docid)
 

	
 
    def __getitem__(self, key):
 
        """
 
        Slicing of resultWrapper
kallithea/tests/functional/test_admin.py
Show inline comments
 
@@ -126,70 +126,70 @@ class TestAdminController(TestController
 

	
 
    def test_filter_journal_filter_prefix_on_user_or_other_user(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='username:demo OR username:volcan'))
 
        response.mustcontain(' 1095 Entries')  # 1087 + 8
 

	
 
    def test_filter_journal_filter_wildcard_on_action(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='action:*pull_request*'))
 
        response.mustcontain(' 187 Entries')
 

	
 
    def test_filter_journal_filter_on_date(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='date:20121010'))
 
        response.mustcontain(' 47 Entries')
 

	
 
    def test_filter_journal_filter_on_date_2(self):
 
        self.log_user()
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter='date:20121020'))
 
        response.mustcontain(' 17 Entries')
 

	
 
    @parametrize('filter,hit', [
 
        #### "repository:" filtering
 
        # "/" is used for grouping
 
        ('repository:group/test', 4),
 
        # "-" is often used for "-fork"
 
        ('repository:fork-test1', 5),
 
        # using "stop words"
 
        ('repository:this', 1),
 
        ('repository:this/is-it', 1),
 

	
 
        ## additional tests to quickly find out regression in the future
 
        ## (and check case-insensitive search, too)
 
        # non-ascii character "." and "-"
 
        ('repository:TESTIES1.2.3', 4),
 
        ('repository:test_git_repo', 2),
 
        # combination with wildcard "*"
 
        ('repository:GROUP/*', 182),
 
        ('repository:*/test', 7),
 
        ('repository:fork-*', 273),
 
        ('repository:*-Test1', 5),
 

	
 
        #### "username:" filtering
 
        # "-" is valid character
 
        ('username:peso-xxx', 0),
 
        ('username:peso-xxx', 4),
 
        # using "stop words"
 
        ('username:this-is-it', 2036),
 
        ('username:this-is-it', 2),
 

	
 
        ## additional tests to quickly find out regression in the future
 
        ## (and check case-insensitive search, too)
 
        # non-ascii character "." and "-"
 
        ('username:ADMIN_xanroot', 6),
 
        ('username:robert.Zaremba', 3),
 
        # combination with wildcard "*"
 
        ('username:THIS-*', 2),
 
        ('username:*-IT', 2),
 
    ])
 
    def test_filter_journal_filter_tokenization(self, filter, hit):
 
        self.log_user()
 

	
 
        response = self.app.get(url(controller='admin/admin', action='index',
 
                                    filter=filter))
 
        if hit != 1:
 
            response.mustcontain(' %s Entries' % hit)
 
        else:
 
            response.mustcontain(' 1 Entry')
0 comments (0 inline, 0 general)