Files
@ f734d107296e
Branch filter:
Location: kallithea/scripts/update-copyrights.py
f734d107296e
6.4 KiB
text/x-python
auth: for default permissions, use existing explicit query result values instead of following dot references in ORM result objects
There have been reports of spurious crashes on resolving references like
.repository from Permissions:
File ".../kallithea/lib/auth.py", line 678, in __wrapper
if self.check_permissions(user):
File ".../kallithea/lib/auth.py", line 718, in check_permissions
return user.has_repository_permission_level(repo_name, self.required_perm)
File ".../kallithea/lib/auth.py", line 450, in has_repository_permission_level
actual_perm = self.permissions['repositories'].get(repo_name)
File ".../kallithea/lib/vcs/utils/lazy.py", line 41, in __get__
value = self._func(obj)
File ".../kallithea/lib/auth.py", line 442, in permissions
return self.__get_perms(user=self, cache=False)
File ".../kallithea/lib/auth.py", line 498, in __get_perms
return compute(user_id, user_is_admin)
File ".../kallithea/lib/auth.py", line 190, in _cached_perms_data
r_k = perm.UserRepoToPerm.repository.repo_name
File ".../sqlalchemy/orm/attributes.py", line 285, in __get__
return self.impl.get(instance_state(instance), dict_)
File ".../sqlalchemy/orm/attributes.py", line 721, in get
value = self.callable_(state, passive)
File ".../sqlalchemy/orm/strategies.py", line 710, in _load_for_state
% (orm_util.state_str(state), self.key)
sqlalchemy.orm.exc.DetachedInstanceError: Parent instance <UserRepoToPerm at ...> is not bound to a Session; lazy load operation of attribute 'repository' cannot proceed (Background on this error at: http://sqlalche.me/e/bhk3)
Permissions are cached between requests: SA result records are stored in
beaker.cache.sql_cache_short and reused in following requests after the initial
session has been removed. References in Permission objects would usually give
lazy lookup ... but not outside the original session, where we would get an
error like this.
Permissions are indeed implemented/used incorrectly. That might explain a part
of the problem. Even if not fully explaining or fixing this problem, it is
still worth fixing:
Permissions are fetched from the database using Session().query with multiple
class/table names (joined together in a way that happens to match the references
specified in the table definitions) - including Repository. The results are
thus "structs" with selected objects. If repositories always were retrieved
using this selected repository, everything would be fine. In some places, this
was what we did.
But in some places, the code happened to do what was more intuitive: just use
.repository and rely on "lazy" resolving. SA was not aware that this one
already was present in the result struct, and would try to fetch it again. Best
case, that could be inefficient. Worst case, it would fail as we see here.
Fix this by only querying from one table but use the "joinedload" option to
also fetch other referenced tables in the same select. (This might
inefficiently return the main record multiple times ... but that was already
the case with the previous approach.)
This change is thus doing multiple things with circular dependencies that can't
be split up in minor parts without taking detours:
The existing repository join like:
.join((Repository, UserGroupRepoToPerm.repository_id == Repository.repo_id))
is thus replaced by:
.options(joinedload(UserGroupRepoToPerm.repository))
Since we only are doing Session.query() on one table, the results will be of
that type instead of "structs" with multiple objects. If only querying for
UserRepoToPerm this means:
- perm.UserRepoToPerm.repository becomes perm.repository
- perm.Permission.permission_name looked at the explicitly queried Permission
in the result struct - instead it should look in the dereferenced
repository as perm.permission.permission_name
There have been reports of spurious crashes on resolving references like
.repository from Permissions:
File ".../kallithea/lib/auth.py", line 678, in __wrapper
if self.check_permissions(user):
File ".../kallithea/lib/auth.py", line 718, in check_permissions
return user.has_repository_permission_level(repo_name, self.required_perm)
File ".../kallithea/lib/auth.py", line 450, in has_repository_permission_level
actual_perm = self.permissions['repositories'].get(repo_name)
File ".../kallithea/lib/vcs/utils/lazy.py", line 41, in __get__
value = self._func(obj)
File ".../kallithea/lib/auth.py", line 442, in permissions
return self.__get_perms(user=self, cache=False)
File ".../kallithea/lib/auth.py", line 498, in __get_perms
return compute(user_id, user_is_admin)
File ".../kallithea/lib/auth.py", line 190, in _cached_perms_data
r_k = perm.UserRepoToPerm.repository.repo_name
File ".../sqlalchemy/orm/attributes.py", line 285, in __get__
return self.impl.get(instance_state(instance), dict_)
File ".../sqlalchemy/orm/attributes.py", line 721, in get
value = self.callable_(state, passive)
File ".../sqlalchemy/orm/strategies.py", line 710, in _load_for_state
% (orm_util.state_str(state), self.key)
sqlalchemy.orm.exc.DetachedInstanceError: Parent instance <UserRepoToPerm at ...> is not bound to a Session; lazy load operation of attribute 'repository' cannot proceed (Background on this error at: http://sqlalche.me/e/bhk3)
Permissions are cached between requests: SA result records are stored in
beaker.cache.sql_cache_short and reused in following requests after the initial
session has been removed. References in Permission objects would usually give
lazy lookup ... but not outside the original session, where we would get an
error like this.
Permissions are indeed implemented/used incorrectly. That might explain a part
of the problem. Even if not fully explaining or fixing this problem, it is
still worth fixing:
Permissions are fetched from the database using Session().query with multiple
class/table names (joined together in a way that happens to match the references
specified in the table definitions) - including Repository. The results are
thus "structs" with selected objects. If repositories always were retrieved
using this selected repository, everything would be fine. In some places, this
was what we did.
But in some places, the code happened to do what was more intuitive: just use
.repository and rely on "lazy" resolving. SA was not aware that this one
already was present in the result struct, and would try to fetch it again. Best
case, that could be inefficient. Worst case, it would fail as we see here.
Fix this by only querying from one table but use the "joinedload" option to
also fetch other referenced tables in the same select. (This might
inefficiently return the main record multiple times ... but that was already
the case with the previous approach.)
This change is thus doing multiple things with circular dependencies that can't
be split up in minor parts without taking detours:
The existing repository join like:
.join((Repository, UserGroupRepoToPerm.repository_id == Repository.repo_id))
is thus replaced by:
.options(joinedload(UserGroupRepoToPerm.repository))
Since we only are doing Session.query() on one table, the results will be of
that type instead of "structs" with multiple objects. If only querying for
UserRepoToPerm this means:
- perm.UserRepoToPerm.repository becomes perm.repository
- perm.Permission.permission_name looked at the explicitly queried Permission
in the result struct - instead it should look in the dereferenced
repository as perm.permission.permission_name
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Kallithea script for maintaining contributor lists from version control
history.
This script and the data in it is a best effort attempt at reverse engineering
previous attributions and correlate that with version control history while
preserving all existing copyright statements and attribution. This script is
processing and summarizing information found elsewhere - it is not by itself
making any claims. Comments in the script are an attempt at reverse engineering
possible explanations - they are not showing any intent or confirming it is
correct.
Three files are generated / modified by this script:
kallithea/templates/about.html claims to show copyright holders, and the GPL
license requires such existing "legal notices" to be preserved. We also try to
keep it updated with copyright holders, but do not claim it is a correct list.
CONTRIBUTORS has the purpose of giving credit where credit is due and list all
the contributor names in the source.
kallithea/templates/base/base.html contains the copyright years in the page
footer.
Both make a best effort of listing all copyright holders, but revision control
history might be a better and more definitive source.
Contributors are sorted "fairly" by copyright year and amount of
contribution.
New contributors are listed, without considering if the contribution contains
copyrightable work.
When the copyright might belong to a different legal entity than the
contributor, the legal entity is given credit too.
"""
import os
import re
from collections import defaultdict
import contributor_data
def sortkey(x):
    """Build the key used to order contributors "fairly".

    ``x`` is a 2-tuple ``(years, name)``: a list of year strings (the first
    item is used as the earliest year and the last as the latest, so the list
    is expected to be in ascending order) and the contributor name.

    The resulting tuple sorts by, most significant first:
    * latest contribution year, newest first
    * first contribution year, oldest first
    * number of contribution years, most first
    * lower-cased name, with e-acute and l-stroke folded to 'e' and 'l'
    """
    contribution_years, contributor = x
    # A contributor without any known year is ranked as if active in year 0.
    effective_years = contribution_years if contribution_years else ['0']
    newest = int(effective_years[-1])
    oldest = int(effective_years[0])
    # Fold the two accented letters so such names sort next to their ASCII
    # neighbours.
    folded_name = contributor.lower().translate({0xe9: 'e', 0x142: 'l'})
    return (-newest, oldest, -len(effective_years), folded_name)
def nice_years(l, dash='-', join=' '):
    """Convert years into a brief list of ranges, like '1900-1901 1921'.

    l: iterable of years (strings or ints), expected in ascending order
    dash: separator placed between the first and last year of a range
    join: separator placed between the individual ranges

    Consecutive years are collapsed into a single 'first<dash>last' range;
    isolated years are emitted on their own. An empty input gives ''.
    Raises ValueError if an item cannot be converted with int().
    """
    years = [int(year) for year in l]
    if not years:
        return ''

    def format_range(first, last):
        # A run of a single year is shown as just that year.
        if first == last:
            return '%s' % first
        return '%s%s%s' % (first, dash, last)

    ranges = []
    start = end = years[0]
    for year in years[1:]:
        if year == end + 1:
            # Still inside a run of consecutive years - extend it.
            end = year
            continue
        ranges.append(format_range(start, end))
        start = end = year
    # Flush the final run explicitly instead of relying on a sentinel value.
    ranges.append(format_range(start, end))
    return join.join(ranges)
def insert_entries(
        filename,
        all_entries,
        no_entries,
        domain_extra,
        split_re,
        normalize_name,
        format_f):
    """Update file with contributor information.

    filename: path of the file to rewrite in place
    all_entries: list of tuples with year and name
    no_entries: set of names or name and year tuples to ignore
    domain_extra: map domain name to extra credit name
    split_re: regexp matching the part of file to rewrite
    normalize_name: function to normalize names for grouping and display
    format_f: function formatting year list and name to a string

    The file is read and written as UTF-8. Raises ValueError if split_re does
    not split the file content into exactly a leading and a trailing part.
    """
    name_years = defaultdict(set)
    for year, name in all_entries:
        if name in no_entries or (name, year) in no_entries:
            continue
        # Lower-case everything after the first ' <' (the email part) so
        # entries differing only in address case are grouped together.
        name_parts = name.split(' <', 1)
        if len(name_parts) == 2:
            name = name_parts[0] + ' <' + name_parts[1].lower()
        # Text after the first '@' without trailing '>'; for a name without
        # '@' this is the whole name, which is simply looked up as well.
        domain = name.split('@', 1)[-1].rstrip('>')
        if domain in domain_extra:
            name_years[domain_extra[domain]].add(year)
        name_years[normalize_name(name)].add(year)

    # Falsy years (e.g. '' or None) only register the name and are dropped
    # from the year list.
    entries = sorted(
        ((sorted(year for year in years if year), name)
         for name, years in name_years.items()),
        key=sortkey)

    # Use an explicit encoding: the content handled here is not pure ASCII,
    # so the result must not depend on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        pieces = re.split(split_re, f.read())
    if len(pieces) != 2:
        raise ValueError(
            '%s: expected pattern %r to split the file in 2 parts, got %s'
            % (filename, split_re, len(pieces)))
    pre, post = pieces
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(pre +
                ''.join(format_f(years, name) for years, name in entries) +
                post)
def main():
    """Regenerate the contributor and copyright information in four files.

    Reads the year and author of every ancestor of the working directory
    parent from Mercurial, applies the name fixes from contributor_data, and
    rewrites the relevant section of each target file with insert_entries.

    Raises subprocess.CalledProcessError if 'hg log' fails, so the files are
    never rewritten based on missing history.
    """
    # Local import: only this entry point needs it.
    import subprocess

    def fixed_name(name):
        # Look up the full "Name <email>" first, then the bare name without
        # the email part; keep the name unchanged when there is no fix.
        fixes = contributor_data.name_fixes
        return fixes.get(name) or fixes.get(name.rsplit('<', 1)[0].strip()) or name

    # Pass the arguments as a list (no shell quoting involved) and fail
    # loudly when hg exits with an error.
    hg_output = subprocess.check_output(
        ['hg', 'log', '-r', '::.', '-T', '{date(date,"%Y")} {author}\n'],
        universal_newlines=True)

    repo_entries = []
    for line in hg_output.split('\n'):
        line = line.strip()
        if not line:
            continue
        year, name = line.split(' ', 1)
        repo_entries.append((year, fixed_name(name)))

    insert_entries(
        filename='kallithea/templates/about.html',
        all_entries=repo_entries + contributor_data.other_about + contributor_data.other,
        no_entries=contributor_data.no_about,
        domain_extra=contributor_data.domain_extra,
        split_re=r'(?: <li>Copyright © [^\n]+</li>\n)+',
        normalize_name=lambda name: name.split('<', 1)[0].strip(),
        format_f=lambda years, name: ' <li>Copyright © %s, %s</li>\n' % (nice_years(years, '–', ', '), name),
    )
    insert_entries(
        filename='CONTRIBUTORS',
        all_entries=repo_entries + contributor_data.other_contributors + contributor_data.other,
        no_entries=contributor_data.total_ignore,
        domain_extra=contributor_data.domain_extra,
        split_re=r'(?: [^\n]+\n)+',
        normalize_name=lambda name: name,
        format_f=lambda years, name: (' %s%s%s\n' % (name, ' ' if years else '', nice_years(years))),
    )
    # All names are normalized to '' here, so a single entry with the union
    # of all years is produced for the page footer.
    insert_entries(
        filename='kallithea/templates/base/base.html',
        all_entries=repo_entries,
        no_entries=contributor_data.total_ignore,
        domain_extra={},
        split_re=r'(?<=©) .+ (?=by various authors)',
        normalize_name=lambda name: '',
        format_f=lambda years, name: ' ' + nice_years(years, '–', ', ') + ' ',
    )
    #docs/conf.py:copyright = u'2010-2016 by various authors, licensed as GPLv3.'
    insert_entries(
        filename='docs/conf.py',
        all_entries=repo_entries,
        no_entries=contributor_data.total_ignore,
        domain_extra={},
        split_re=r"(?<=copyright = ').+(?= by various authors)",
        normalize_name=lambda name: '',
        format_f=lambda years, name: nice_years(years, '-', ', '),
    )
# Run the update only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# To list new contributors since last tagging:
# { hg log -r '::tagged()' -T ' {author}\n {author}\n'; hg log -r '::.' -T ' {author}\n' | sort | uniq; } | sort | uniq -u
|