kallithea Changeset - 0f69b5c35b2b

Changeset - 0f69b5c35b2b

Parent rev.

Child rev.

[Not reviewed]

default

0 2 0

Mads Kiilerich - 6 years ago 2019-12-27 23:30:56
mads@kiilerich.com

Grafted from: 57ed7fc231aa

lib: introduce string conversion functions for ASCII without further encoding concerns

Avoid the trial-and-error and vagueness of the "safe" functions.

This should replace some uses of safe_unicode and safe_str. It will mostly be a
no-op in py2 but will be crucial in py3.

2 files changed with 57 insertions and 1 deletions:

kallithea/lib/utils2.py

kallithea/lib/vcs/utils/__init__.py

0 comments (0 inline, 0 general)

kallithea/lib/utils2.py

➞

Show inline comments

@@ @@ -43,7 +43,7 @@ from tg.i18n import ungettext @@
 from webhelpers2.text import collapse, remove_formatting, strip_tags
 from kallithea.lib.compat import json
 from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode  # re-export
+from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode  # re-export
 from kallithea.lib.vcs.utils.lazy import LazyProperty

kallithea/lib/vcs/utils/__init__.py

➞

Show inline comments

 # -*- coding: utf-8 -*-
 """
 This module provides some useful tools for ``vcs`` like annotate/diff html
 output. It also includes some internal helpers.
@@ @@ -121,6 +123,60 @@ def safe_bytes(s): @@
 safe_str = safe_bytes  # safe_str is a deprecated alias of safe_bytes - it will be redefined when changing to py3
 def ascii_bytes(s):
     """
     Simple conversion from unicode/str to bytes, *assuming* all codepoints are
     7-bit and it thus is pure ASCII.

     Will fail badly with UnicodeError on invalid input.
     This should be used where encoding and "safe" ambiguity should be avoided.
     Use it where strings already have been encoded in other ways but still are
     unicode strings - for example to hex, base64, json, urlencoding - or are
     known to be identifiers.

     >>> ascii_bytes('a')
     'a'
     >>> ascii_bytes(u'a')
     'a'
     >>> ascii_bytes('å')
     Traceback (most recent call last):
     UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
     >>> ascii_bytes(u'å')
     Traceback (most recent call last):
     UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
     """
     # Only unicode/str input is accepted; anything else trips the assertion,
     # with the repr of the offending value as the message.
     assert isinstance(s, (unicode, str)), repr(s)
     # Strict ASCII encoding: non-ASCII content raises UnicodeError instead of
     # being guessed at or replaced (see the doctests above).
     return s.encode('ascii')
 def ascii_str(s):
     r"""
     Simple conversion from bytes to str, *assuming* all codepoints are
     7-bit and it thus is pure ASCII.

     Will fail badly with UnicodeError on invalid input.
     This should be used where encoding and "safe" ambiguity should be avoided.
     Use it where strings are encoded but also in other ways are known to be
     ASCII, and where a unicode string is wanted without caring about encoding.
     For example for strings encoded to hex, base64, urlencoding, or strings
     that are known to be identifiers.

     >>> ascii_str('a')
     'a'
     >>> ascii_str(u'a')
     Traceback (most recent call last):
     AssertionError: u'a'
     >>> ascii_str('å')
     Traceback (most recent call last):
     UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
     >>> ascii_str(u'å')
     Traceback (most recent call last):
     AssertionError: u'\xc3\xa5'
     """
     # Only bytes input is accepted; anything else (including unicode, see the
     # doctests above) trips the assertion with the repr of the value.
     assert isinstance(s, bytes), repr(s)
     # Note: we use "encode", even though we really *should* use "decode". But
     # we are in py2 and don't want py2, and encode is doing what we need for the
     # ascii subset.
     # NOTE(review): "don't want py2" above presumably means "don't want
     # unicode" (the result type of decode in py2) - confirm with the author.
     return s.encode('ascii')
 # Regex taken from http://www.regular-expressions.info/email.html
 email_re = re.compile(
     r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""

0 comments (0 inline, 0 general)