kallithea Changeset - 2ff913970025

Changeset - 2ff913970025

Parent rev.

Child rev.

[Not reviewed]

default

0 2 0

FUJIWARA Katsunori - 9 years ago 2017-01-22 18:17:38
foozy@lares.dti.ne.jp

journal: make "username:" filtering condition work as expected

As described in the previous revision, using TEXT in JOURNAL_SCHEMA causes
unexpected results for "username:" as well:

- tokenization by non-alphanumeric characters
- removing "stop words"

To make the "username:" filtering condition work as expected, this
revision uses ID instead of TEXT for the "username" field of JOURNAL_SCHEMA.

2 files changed with 3 insertions and 3 deletions:

kallithea/lib/indexers/__init__.py

kallithea/tests/functional/test_admin.py

0 comments (0 inline, 0 general)

kallithea/lib/indexers/__init__.py

➞

Show inline comments

 # -*- coding: utf-8 -*-
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 kallithea.lib.indexers
 ~~~~~~~~~~~~~~~~~~~~~~
 Whoosh indexing module for Kallithea
 This file was forked by the Kallithea project in July 2014.
 Original author and date, and relevant copyright and licensing information is below:
 :created_on: Aug 17, 2010
 :author: marcink
 :copyright: (c) 2013 RhodeCode GmbH, and others.
 :license: GPLv3, see LICENSE.md for more details.
 """
 import os
 import sys
 import logging
 from os.path import dirname
 # Add location of top level folder to sys.path
 sys.path.append(dirname(dirname(dirname(os.path.realpath(__file__)))))
 from whoosh.analysis import RegexTokenizer, LowercaseFilter
 from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType, DATETIME
 from whoosh.formats import Characters
 from whoosh.highlight import highlight as whoosh_highlight, HtmlFormatter, ContextFragmenter
 from kallithea.lib.utils2 import LazyProperty
 log = logging.getLogger(__name__)
 # Custom analyzer: split text into runs of word characters (``\w+``) and
 # lowercase every resulting token.
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 # Index schema definition for per-file documents. ``repository``, ``path``,
 # ``content``, ``modtime`` and ``extension`` are stored, so they can be read
 # back from a hit; ``content`` is analyzed with ANALYZER.
 SCHEMA = Schema(
     fileid=ID(unique=True),
     owner=TEXT(),
     repository=TEXT(stored=True),
     path=TEXT(stored=True),
     content=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     modtime=STORED(),
     extension=TEXT(stored=True)
+)
 # Name of the index that uses SCHEMA.
 IDX_NAME = 'HG_INDEX'
 # Highlight formatter: matches are wrapped in <span> tags and the ``between``
 # string is used as separator between fragments.
 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 # Highlight fragmenter; 200 is presumably the amount of surrounding context
 # (in characters) kept around a match -- confirm against the Whoosh docs.
 FRAGMENTER = ContextFragmenter(200)
 # Index schema for changeset documents. ``raw_id``, ``date``, ``repository``,
 # ``author`` and ``message`` are stored, so they can be read back from a hit;
 # ``message`` is analyzed with ANALYZER.
 CHGSETS_SCHEMA = Schema(
     raw_id=ID(unique=True, stored=True),
     date=NUMERIC(stored=True),
     last=BOOLEAN(),
     owner=TEXT(),
     repository=ID(unique=True, stored=True),
     author=TEXT(stored=True),
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
     added=TEXT(),
     removed=TEXT(),
     changed=TEXT(),
+)
 # Name of the index that uses CHGSETS_SCHEMA.
 CHGSET_IDX_NAME = 'CHGSET_INDEX'
 # used only to generate queries in journal
 #
 # "username" is an ID field rather than a TEXT field: with TEXT, filter
 # values were tokenized on non-alphanumeric characters and had "stop words"
 # removed, which gave unexpected results for "username:" filters.
 JOURNAL_SCHEMA = Schema(
-    username=TEXT(),
+    username=ID(),
     date=DATETIME(),
     action=TEXT(),
     repository=ID(),
     ip=TEXT(),
+)
 class WhooshResultWrapper(object):
     """
     Sized, iterable and sliceable wrapper around the hits of a Whoosh matcher.

     The matcher is walked when ``doc_ids`` is first accessed (``doc_ids`` is
     a ``LazyProperty``, presumably evaluated once and then cached -- confirm
     in ``kallithea.lib.utils2``). The stored fields of a hit are only fetched
     and decorated when the hit is iterated over or sliced.
     """
     def __init__(self, search_type, searcher, matcher, highlight_items,
                  repo_location):
         """
         :param search_type: kind of search; this class special-cases
             ``'content'``, ``'path'`` and ``'message'``
         :param searcher: object whose ``stored_fields(docnum)`` returns the
             stored fields of a document
         :param matcher: matcher positioned on the first hit; it is consumed
             by ``doc_ids``
         :param highlight_items: terms passed on to the Whoosh highlighter
         :param repo_location: base directory that is joined with a hit's
             ``repository`` value to strip that prefix from stored paths
         """
         self.search_type = search_type
         self.searcher = searcher
         self.matcher = matcher
         self.highlight_items = highlight_items
         # number of characters of context kept on each side of a match span
         self.fragment_size = 200
         self.repo_location = repo_location
     @LazyProperty
     def doc_ids(self):
         """
         Walk the matcher to exhaustion and return a list of
         ``[docnum, chunks]`` pairs, where ``chunks`` is the list of
         ``(start, end)`` offsets produced by ``get_chunks`` for that hit.
         """
         docs_id = []
         while self.matcher.is_active():
             docnum = self.matcher.id()
             # materialize the generator now, while the matcher is still
             # positioned on this document
             chunks = [offsets for offsets in self.get_chunks()]
             docs_id.append([docnum, chunks])
             self.matcher.next()
         return docs_id
     def __str__(self):
         # '<ClassName at N>' where N is the number of hits
         return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
     def __repr__(self):
         return self.__str__()
     def __len__(self):
         # number of hits
         return len(self.doc_ids)
     def __iter__(self):
         """
         Iterate over all hits, generating the full content of each hit
         lazily, one at a time (see ``get_full_content``).
         """
         for docid in self.doc_ids:
             yield self.get_full_content(docid)
     def __getitem__(self, key):
         """
         Return the list of full results for the slice ``key``.

         Only slice objects are supported: ``key.start`` and ``key.stop`` are
         read and any ``step`` is ignored. A plain integer index has no such
         attributes and would fail with ``AttributeError``.
         """
         i, j = key.start, key.stop
         slices = []
         for docid in self.doc_ids[i:j]:
             slices.append(self.get_full_content(docid))
         return slices
     def get_full_content(self, docid):
         """
         Return the stored fields of one hit, extended according to
         ``search_type``.

         :param docid: a ``[docnum, chunks]`` pair as built by ``doc_ids``
         :returns: the stored-fields mapping, with these keys added:

             - ``'content'``: ``content_short``, ``content_short_hl``, ``f_path``
             - ``'path'``: ``f_path``
             - ``'message'``: ``message_hl``

             For any other search type the stored fields are returned as is.
         """
         res = self.searcher.stored_fields(docid[0])
         log.debug('result: %s', res)
         if self.search_type == 'content':
             full_repo_path = os.path.join(self.repo_location, res['repository'])
             # keep what follows the last occurrence of the repository's full
             # path, without leading path separators
             f_path = res['path'].split(full_repo_path)[-1]
             f_path = f_path.lstrip(os.sep)
             content_short = self.get_short_content(res, docid[1])
             res.update({'content_short': content_short,
                         'content_short_hl': self.highlight(content_short),
                         'f_path': f_path
             })
         elif self.search_type == 'path':
             full_repo_path = os.path.join(self.repo_location, res['repository'])
             # same path shortening as for 'content' searches
             f_path = res['path'].split(full_repo_path)[-1]
             f_path = f_path.lstrip(os.sep)
             res.update({'f_path': f_path})
         elif self.search_type == 'message':
             res.update({'message_hl': self.highlight(res['message'])})
         log.debug('result: %s', res)
         return res
     def get_short_content(self, res, chunks):
         """
         Concatenate the ``res['content'][start:end]`` slices for every
         ``(start, end)`` pair in ``chunks`` into one string.
         """
         return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
     def get_chunks(self):
         """
         Generate ``(start, end)`` character offsets around each match span
         of the document the matcher is currently positioned on.

         Every span is widened by ``fragment_size`` on both sides. The start
         of a chunk is moved forward to the end of the previously yielded
         chunk if they would overlap, so close occurrences are not
         highlighted twice. Nothing is yielded if the matcher does not
         support ``'positions'``.
         """
         # sentinel entry so that the first chunk has a "previous" chunk
         memory = [(0, 0)]
         if self.matcher.supports('positions'):
             for span in self.matcher.spans():
                 # startchar/endchar may be None; treat that as offset 0
                 start = span.startchar or 0
                 end = span.endchar or 0
                 start_offseted = max(0, start - self.fragment_size)
                 end_offseted = end + self.fragment_size
                 # do not start before the end of the previous chunk
                 if start_offseted < memory[-1][1]:
                     start_offseted = memory[-1][1]
                 memory.append((start_offseted, end_offseted,))
                 yield (start_offseted, end_offseted,)
     def highlight(self, content, top=5):
         """
         Return ``content`` with the search terms highlighted, using the
         module-level ANALYZER, FRAGMENTER and FORMATTER.

         :param content: text to highlight
         :param top: passed to the Whoosh highlighter as ``top``
         :returns: the highlighter output, or ``''`` when ``search_type`` is
             neither ``'content'`` nor ``'message'``
         """
         if self.search_type not in ['content', 'message']:
             return ''
         hl = whoosh_highlight(
             text=content,
             terms=self.highlight_items,
             analyzer=ANALYZER,
             fragmenter=FRAGMENTER,
             formatter=FORMATTER,
             top=top
+        )
         return hl

kallithea/tests/functional/test_admin.py

➞

Show inline comments

 import os
 import csv
 import datetime
 from kallithea.tests.base import *
 from kallithea.model.db import UserLog
 from kallithea.model.meta import Session
 from kallithea.lib.utils2 import safe_unicode
 from os.path import dirname
 # Absolute path of the "fixtures" directory that sits in the parent of the
 # directory containing this test module.
 FIXTURES = os.path.join(dirname(dirname(os.path.abspath(__file__))), 'fixtures')
 class TestAdminController(TestController):
     """
     Functional tests for the admin journal page and its ``filter`` query
     parameter.

     The expected entry counts asserted below depend on the contents of the
     ``journal_dump.csv`` fixture loaded in ``setup_class``.
     """
     @classmethod
     def setup_class(cls):
         """
         Replace all ``UserLog`` rows with the rows of ``journal_dump.csv``.
         """
         UserLog.query().delete()
         Session().commit()
         def strptime(val):
             # Parse 'YYYY-mm-dd HH:MM:SS' with an optional fractional-seconds
             # part of any length after a ".".
             fmt = '%Y-%m-%d %H:%M:%S'
             if '.' not in val:
                 return datetime.datetime.strptime(val, fmt)
             nofrag, frag = val.split(".")
             date = datetime.datetime.strptime(nofrag, fmt)
             frag = frag[:6]  # truncate to microseconds
             frag += (6 - len(frag)) * '0'  # add 0s
             return date.replace(microsecond=int(frag))
         with open(os.path.join(FIXTURES, 'journal_dump.csv')) as f:
             for row in csv.DictReader(f):
                 ul = UserLog()
                 # NOTE(review): dict.iteritems() only exists on Python 2
                 for k, v in row.iteritems():
                     v = safe_unicode(v)
                     if k == 'action_date':
                         v = strptime(v)
                     if k in ['user_id', 'repository_id']:
                         # nullable due to FK problems
                         v = None
                     setattr(ul, k, v)
                 Session().add(ul)
             Session().commit()
     @classmethod
     def teardown_class(cls):
         # remove the journal entries loaded by setup_class
         UserLog.query().delete()
         Session().commit()
     def test_index(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index'))
         response.mustcontain('Admin Journal')
     def test_filter_all_entries(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',))
         response.mustcontain(' 2036 Entries')
     def test_filter_journal_filter_exact_match_on_repository(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:xxx'))
         response.mustcontain(' 3 Entries')
     def test_filter_journal_filter_exact_match_on_repository_CamelCase(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:XxX'))
         response.mustcontain(' 3 Entries')
     def test_filter_journal_filter_wildcard_on_repository(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:*test*'))
         response.mustcontain(' 862 Entries')
     def test_filter_journal_filter_prefix_on_repository(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:test*'))
         response.mustcontain(' 257 Entries')
     def test_filter_journal_filter_prefix_on_repository_CamelCase(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:Test*'))
         response.mustcontain(' 257 Entries')
     def test_filter_journal_filter_prefix_on_repository_and_user(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:test* AND username:demo'))
         response.mustcontain(' 130 Entries')
     def test_filter_journal_filter_prefix_on_repository_or_other_repo(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='repository:test* OR repository:xxx'))
         response.mustcontain(' 260 Entries')  # 257 + 3
     def test_filter_journal_filter_exact_match_on_username(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='username:demo'))
         response.mustcontain(' 1087 Entries')
     def test_filter_journal_filter_exact_match_on_username_camelCase(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='username:DemO'))
         response.mustcontain(' 1087 Entries')
     def test_filter_journal_filter_wildcard_on_username(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='username:*test*'))
         response.mustcontain(' 100 Entries')
     def test_filter_journal_filter_prefix_on_username(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='username:demo*'))
         response.mustcontain(' 1101 Entries')
     def test_filter_journal_filter_prefix_on_user_or_other_user(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='username:demo OR username:volcan'))
         response.mustcontain(' 1095 Entries')  # 1087 + 8
     def test_filter_journal_filter_wildcard_on_action(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='action:*pull_request*'))
         response.mustcontain(' 187 Entries')
     def test_filter_journal_filter_on_date(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='date:20121010'))
         response.mustcontain(' 47 Entries')
     def test_filter_journal_filter_on_date_2(self):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter='date:20121020'))
         response.mustcontain(' 17 Entries')
     # Each case is (filter expression, expected number of journal entries).
     @parametrize('filter,hit', [
         #### "repository:" filtering
         # "/" is used for grouping
         ('repository:group/test', 4),
         # "-" is often used for "-fork"
         ('repository:fork-test1', 5),
         # using "stop words"
         ('repository:this', 1),
         ('repository:this/is-it', 1),
         ## additional tests to quickly find out regression in the future
         ## (and check case-insensitive search, too)
         # punctuation characters "." and "_"
         ('repository:TESTIES1.2.3', 4),
         ('repository:test_git_repo', 2),
         # combination with wildcard "*"
         ('repository:GROUP/*', 182),
         ('repository:*/test', 7),
         ('repository:fork-*', 273),
         ('repository:*-Test1', 5),
         #### "username:" filtering
         # "-" is valid character
-        ('username:peso-xxx', 0),
+        ('username:peso-xxx', 4),
         # using "stop words"
-        ('username:this-is-it', 2036),
+        ('username:this-is-it', 2),
         ## additional tests to quickly find out regression in the future
         ## (and check case-insensitive search, too)
         # punctuation characters "_" and "."
         ('username:ADMIN_xanroot', 6),
         ('username:robert.Zaremba', 3),
         # combination with wildcard "*"
         ('username:THIS-*', 2),
         ('username:*-IT', 2),
     ])
     def test_filter_journal_filter_tokenization(self, filter, hit):
         self.log_user()
         response = self.app.get(url(controller='admin/admin', action='index',
                                     filter=filter))
         # the page uses the singular "Entry" for exactly one hit
         if hit != 1:
             response.mustcontain(' %s Entries' % hit)
         else:
             response.mustcontain(' 1 Entry')

0 comments (0 inline, 0 general)