Files
@ caef0be39948
Branch filter:
Location: kallithea/scripts/generate-ini.py
caef0be39948
6.7 KiB
text/x-python
search: make "repository:" condition work as expected
Before this revision, "repository:foo" condition at searching for
"File contents" or "File names" shows files in repositories below.
- foo
- foo/bar
- foo-bar
- and so on ...
The Whoosh library, which is used to parse text for indexing and searching,
does the following:
- treats almost all non-alphanumeric characters as delimiters, both when
indexing search items and when parsing search conditions
- makes each field of a search item be indexed by multiple values
For example, files in the "foo/bar" repository are indexed by "foo" and
"bar" in the "repository" field. This tokenization makes the "repository:foo"
search condition match files in the "foo/bar" repository, too.
In addition, using plain TEXT also causes "stop words" in search
conditions to be ignored unintentionally. For example, "this",
"a", "you", and so on are ignored when indexing and parsing, because
these words are too generic (from the point of view of generic "text
search").
This issue can't be resolved by using ID instead of TEXT for
"repository" of SCHEMA, as previous revisions did for JOURNAL_SCHEMA,
because:
- highlighting file content requires SCHEMA to support the "positions"
feature, but using ID instead of TEXT disables it
- using ID violates the current case-insensitive search policy, because
it preserves the case of text
To make "repository:" condition work as expected, this revision
explicitly specifies "analyzer", which does:
- avoid tokenization
- match case-insensitively
- avoid removing "stop words" from text
This revision requires full re-building index tables, because indexing
schema is changed.
BTW, the "repository:" condition when searching for "Commit messages" uses
CHGSETS_SCHEMA instead of SCHEMA. The former uses ID for "repository",
which:
- avoids the issues caused by tokenization and removal of "stop words"
- disables the "positions" feature of CHGSETS_SCHEMA
But highlighting file content isn't needed when searching for
"Commit messages". Therefore, this can be ignored.
- preserves the case of text
This violates the current case-insensitive search policy. This issue
will be fixed by a subsequent revision, because fixing it isn't
simple.
Before this revision, "repository:foo" condition at searching for
"File contents" or "File names" shows files in repositories below.
- foo
- foo/bar
- foo-bar
- and so on ...
The Whoosh library, which is used to parse text for indexing and searching,
does the following:
- treats almost all non-alphanumeric characters as delimiters, both when
indexing search items and when parsing search conditions
- makes each field of a search item be indexed by multiple values
For example, files in the "foo/bar" repository are indexed by "foo" and
"bar" in the "repository" field. This tokenization makes the "repository:foo"
search condition match files in the "foo/bar" repository, too.
In addition, using plain TEXT also causes "stop words" in search
conditions to be ignored unintentionally. For example, "this",
"a", "you", and so on are ignored when indexing and parsing, because
these words are too generic (from the point of view of generic "text
search").
This issue can't be resolved by using ID instead of TEXT for
"repository" of SCHEMA, as previous revisions did for JOURNAL_SCHEMA,
because:
- highlighting file content requires SCHEMA to support the "positions"
feature, but using ID instead of TEXT disables it
- using ID violates the current case-insensitive search policy, because
it preserves the case of text
To make "repository:" condition work as expected, this revision
explicitly specifies "analyzer", which does:
- avoid tokenization
- match case-insensitively
- avoid removing "stop words" from text
This revision requires full re-building index tables, because indexing
schema is changed.
BTW, the "repository:" condition when searching for "Commit messages" uses
CHGSETS_SCHEMA instead of SCHEMA. The former uses ID for "repository",
which:
- avoids the issues caused by tokenization and removal of "stop words"
- disables the "positions" feature of CHGSETS_SCHEMA
But highlighting file content isn't needed when searching for
"Commit messages". Therefore, this can be ignored.
- preserves the case of text
This violates the current case-insensitive search policy. This issue
will be fixed by a subsequent revision, because fixing it isn't
simple.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | #!/usr/bin/env python2
"""
Based on kallithea/bin/template.ini.mako, generate
kallithea/config/deployment.ini_tmpl
development.ini
kallithea/tests/test.ini
"""
import re
# Path of the mako template that every ini file is generated from.
makofile = 'kallithea/bin/template.ini.mako'

# The mako conditionals treated as selected in all generated ini files;
# lines guarded by any other conditional are emitted commented out.
selected_mako_conditionals = {
    "database_engine == 'sqlite'",
    "http_server == 'waitress'",
    "error_aggregation_service == 'appenlight'",
    "error_aggregation_service == 'sentry'",
}

# Values substituted for ${...} mako variables in all generated ini files;
# variables not listed here are left as they appear in the template.
mako_variable_values = {
    'host': '127.0.0.1',
    'port': '5000',
    'here': '%(here)s',
    'uuid()': '${app_instance_uuid}',
}
# Files to be generated from the mako template. Each entry is a tuple of
# (output path, header description text, {section name: {setting: value}}).
ini_files = [
    (
        'kallithea/config/deployment.ini_tmpl',
        (
            '\n'
            'Kallithea - Example config\n'
            'The %(here)s variable will be replaced with the parent directory of this file\n'
        ),
        {},  # exactly the same settings as template.ini.mako
    ),
    (
        'kallithea/tests/test.ini',
        (
            '\n'
            'Kallithea - config for tests:\n'
            'initial_repo_scan = true\n'
            'sqlalchemy and kallithea_test.sqlite\n'
            'custom logging\n'
            'The %(here)s variable will be replaced with the parent directory of this file\n'
        ),
        {
            '[server:main]': {
                'port': '4999',
            },
            '[app:main]': {
                'initial_repo_scan': 'true',
                'app_instance_uuid': 'test',
                'show_revision_number': 'true',
                'beaker.cache.sql_cache_short.expire': '1',
                'beaker.session.secret': '{74e0cd75-b339-478b-b129-07dd221def1f}',
                'cache_dir': '%(here)s/../../data/test/cache',
                'index_dir': '%(here)s/../../data/test/index',
                'archive_cache_dir': '%(here)s/../../data/test/tarballcache',
                'beaker.cache.data_dir': '%(here)s/../../data/test/cache/data',
                'beaker.cache.lock_dir': '%(here)s/../../data/test/cache/lock',
                'sqlalchemy.url': 'sqlite:///%(here)s/kallithea_test.sqlite',
            },
            '[handler_console]': {
                'level': 'DEBUG',
                'formatter': 'color_formatter',
            },
            # The 'handler_console_sql' block is very similar to the one in
            # development.ini, but without the explicit 'level=DEBUG' setting:
            # it causes duplicate sqlalchemy debug logs, one through
            # handler_console_sql and another through another path.
            '[handler_console_sql]': {
                'formatter': 'color_formatter_sql',
            },
        },
    ),
    (
        'development.ini',
        (
            '\n'
            'Kallithea - Development config:\n'
            'listening on *:5000\n'
            'sqlite and kallithea.db\n'
            'initial_repo_scan = true\n'
            'set debug = true\n'
            'verbose and colorful logging\n'
            'The %(here)s variable will be replaced with the parent directory of this file\n'
        ),
        {
            '[server:main]': {
                'host': '0.0.0.0',
            },
            '[app:main]': {
                'initial_repo_scan': 'true',
                'set debug': 'true',
                'app_instance_uuid': 'development-not-secret',
                'beaker.session.secret': 'development-not-secret',
            },
            '[handler_console]': {
                'level': 'DEBUG',
                'formatter': 'color_formatter',
            },
            '[handler_console_sql]': {
                'level': 'DEBUG',
                'formatter': 'color_formatter_sql',
            },
        },
    ),
]
def _read_file(path):
    """Return the complete content of the file at `path`."""
    # 'with' closes the handle deterministically instead of relying on
    # garbage collection.
    with open(path) as f:
        return f.read()


def _write_file(path, content):
    """Replace the content of the file at `path` with `content`."""
    with open(path, 'w') as f:
        f.write(content)


def _select_conditionals(mako_text):
    """Return `mako_text` with every %if ... %endif block resolved.

    The %if/%elif/%endif marker lines are dropped. Lines guarded by a
    conditional listed in selected_mako_conditionals are kept raw; lines
    guarded by any other conditional are kept but commented out.
    """
    def sub_conditional(m):
        """given a conditional and the corresponding lines, return them raw
        or commented out, based on whether conditional is selected
        """
        criteria, lines = m.groups()
        if criteria not in selected_mako_conditionals:
            # empty lines and lines that already are comments stay untouched
            lines = '\n'.join(
                line if not line or line.startswith('#') else '#' + line
                for line in lines.split('\n'))
        return lines

    def sub_conditionals(m):
        """given a %if...%endif match, replace with just the selected
        conditional sections enabled and the rest as comments
        """
        return re.sub(r'^%(?:el)?if (.*):\n((?:^[^%\n].*\n|\n)*)',
                      sub_conditional, m.group(1), flags=re.MULTILINE)

    return re.sub(r'^(%if .*\n(?:[^%\n].*\n|%elif .*\n|\n)*)%endif\n',
                  sub_conditionals, mako_text, flags=re.MULTILINE)


def _expand_variables(text):
    """Return `text` with each ${name} replaced by its value from
    mako_variable_values; unknown variables are left as they are.
    """
    def pyrepl(m):
        return mako_variable_values.get(m.group(1), m.group(0))

    return re.sub(r'\${([^}]*)}', pyrepl, text)


def _render_ini(base_ini, desc, settings):
    """Return the content of one ini file derived from `base_ini`.

    base_ini -- template text with conditionals and variables resolved
    desc     -- description text; each stripped line becomes one line of
                the comment header replacing the generic header line
    settings -- {section name: {setting: value}} overrides to apply
    """
    header = ''.join('# %-77s#\n' % line.strip()
                     for line in desc.strip().split('\n'))
    ini_lines = re.sub(
        '# Kallithea - config file generated with kallithea-config *#\n',
        header,
        base_ini)

    def process_section(m):
        """process a ini section, replacing values as necessary"""
        sectionname, lines = m.groups()
        if sectionname in settings:
            section_settings = settings[sectionname]

            def process_line(m):
                """process a section line and update value if necessary"""
                setting, value = m.groups()
                line = m.group(0)
                if setting in section_settings:
                    line = '%s = %s' % (setting, section_settings[setting])
                    # keep the template value as a commented-out line,
                    # unless it contains a '$'
                    if '$' not in value:
                        line = '#%s = %s\n%s' % (setting, value, line)
                return line.rstrip()

            lines = re.sub(r'^([^#\n].*) = ?(.*)', process_line, lines, flags=re.MULTILINE)
        return sectionname + '\n' + lines

    return re.sub(r'^(\[.*\])\n((?:(?:[^[\n].*)?\n)*)', process_section, ini_lines, flags=re.MULTILINE)


def main():
    """Normalize the mako template and regenerate all files in ini_files.

    Reads `makofile`, rewrites it if its <%text> markup of '##' comment
    lines is not in the normalized form, and writes one ini file for each
    entry of `ini_files`. Progress is printed to stdout.
    """
    # make sure all mako lines starting with '#' (the '##' comments) are marked up as <text>
    # (a single pre-formatted argument prints the same on Python 2 and 3)
    print('reading: %s' % makofile)
    mako_org = _read_file(makofile)
    mako_no_text_markup = re.sub(r'</?%text>', '', mako_org)
    mako_marked_up = re.sub(r'\n(##.*)', r'\n<%text>\1</%text>', mako_no_text_markup, flags=re.MULTILINE)
    if mako_marked_up != mako_org:
        print('writing: %s' % makofile)
        _write_file(makofile, mako_marked_up)

    # select the right mako conditionals for the other less sophisticated formats
    mako_no_conditionals = _select_conditionals(mako_no_text_markup)

    # expand mako variables
    mako_no_variables = _expand_variables(mako_no_conditionals)

    # remove utf-8 coding header
    base_ini = re.sub(r'^## -\*- coding: utf-8 -\*-\n', '', mako_no_variables)

    # create ini files
    for fn, desc, settings in ini_files:
        print('updating: %s' % fn)
        _write_file(fn, _render_ini(base_ini, desc, settings))
# Regenerate the ini files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|