# HG changeset patch # User Mads Kiilerich # Date 2019-12-27 23:30:56 # Node ID 0f69b5c35b2bfe83bc38585a30709f29a42a1752 # Parent 4e565c5d7b7d9835a30f46bf2873fc765c5bb645 lib: introduce string conversion functions for ASCII without further encoding concerns Avoid the trial-and-error and vagueness of the "safe" functions. This should replace some use of safe_unicode and safe_str. It will mostly be a noop in py2 but will be crucial in py3. diff --git a/kallithea/lib/utils2.py b/kallithea/lib/utils2.py --- a/kallithea/lib/utils2.py +++ b/kallithea/lib/utils2.py @@ -43,7 +43,7 @@ from tg.i18n import ungettext from webhelpers2.text import collapse, remove_formatting, strip_tags from kallithea.lib.compat import json -from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode # re-export +from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode # re-export from kallithea.lib.vcs.utils.lazy import LazyProperty diff --git a/kallithea/lib/vcs/utils/__init__.py b/kallithea/lib/vcs/utils/__init__.py --- a/kallithea/lib/vcs/utils/__init__.py +++ b/kallithea/lib/vcs/utils/__init__.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + """ This module provides some useful tools for ``vcs`` like annotate/diff html output. It also includes some internal helpers. @@ -121,6 +123,60 @@ def safe_bytes(s): safe_str = safe_bytes # safe_str is deprecated - it will be redefined when changing to py3 +def ascii_bytes(s): + """ + Simple conversion from unicode/str to bytes, *assuming* all codepoints are + 7-bit and it thus is pure ASCII. + Will fail badly with UnicodeError on invalid input. + This should be used where encoding and "safe" ambiguity should be avoided. + Where strings already have been encoded in other ways but still are unicode + strings - for example to hex, base64, json, urlencoding, or are known to be + identifiers. 
+ + >>> ascii_bytes('a') + 'a' + >>> ascii_bytes(u'a') + 'a' + >>> ascii_bytes('å') + Traceback (most recent call last): + UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128) + >>> ascii_bytes(u'å') + Traceback (most recent call last): + UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128) + """ + assert isinstance(s, (unicode, str)), repr(s) + return s.encode('ascii') + + +def ascii_str(s): + r""" + Simple conversion from bytes to str, *assuming* all codepoints are + 7-bit and it thus is pure ASCII. + Will fail badly with UnicodeError on invalid input. + This should be used where encoding and "safe" ambiguity should be avoided. + Where strings are encoded but also in other ways are known to be ASCII, and + where a unicode string is wanted without caring about encoding. For example + to hex, base64, urlencoding, or are known to be identifiers. + + >>> ascii_str('a') + 'a' + >>> ascii_str(u'a') + Traceback (most recent call last): + AssertionError: u'a' + >>> ascii_str('å') + Traceback (most recent call last): + UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128) + >>> ascii_str(u'å') + Traceback (most recent call last): + AssertionError: u'\xc3\xa5' + """ + assert isinstance(s, bytes), repr(s) + # Note: we use "encode", even though we really *should* use "decode". But + # we are in py2 and don't want a unicode object, and encode is doing what we need for the + # ascii subset. + return s.encode('ascii') + + # Regex taken from http://www.regular-expressions.info/email.html email_re = re.compile( r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""