# HG changeset patch
# User Mads Kiilerich <mads@kiilerich.com>
# Date 2019-12-27 23:30:56
# Node ID 0f69b5c35b2bfe83bc38585a30709f29a42a1752
# Parent  4e565c5d7b7d9835a30f46bf2873fc765c5bb645

lib: introduce string conversion functions for ASCII without further encoding concerns

Avoid the trial-and-error and vagueness of the "safe" functions.

This should replace some use of safe_unicode and safe_str. It will mostly be a
noop in py2 but will be crucial in py3.

diff --git a/kallithea/lib/utils2.py b/kallithea/lib/utils2.py
--- a/kallithea/lib/utils2.py
+++ b/kallithea/lib/utils2.py
@@ -43,7 +43,7 @@ from tg.i18n import ungettext
 from webhelpers2.text import collapse, remove_formatting, strip_tags
 
 from kallithea.lib.compat import json
-from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode  # re-export
+from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode  # re-export
 from kallithea.lib.vcs.utils.lazy import LazyProperty
 
 
diff --git a/kallithea/lib/vcs/utils/__init__.py b/kallithea/lib/vcs/utils/__init__.py
--- a/kallithea/lib/vcs/utils/__init__.py
+++ b/kallithea/lib/vcs/utils/__init__.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 """
 This module provides some useful tools for ``vcs`` like annotate/diff html
 output. It also includes some internal helpers.
@@ -121,6 +123,60 @@ def safe_bytes(s):
 safe_str = safe_bytes  # safe_str is deprecated - it will be redefined when changing to py3
 
 
+def ascii_bytes(s):
+    """
+    Simple conversion from unicode/str to bytes, *assuming* all codepoints are
+    7-bit and it thus is pure ASCII.
+    Will fail badly with UnicodeError on invalid input.
+    This should be used where encoding and "safe" ambiguity should be avoided.
+    Where strings already have been encoded in other ways but still are unicode
+    strings - for example to hex, base64, json, urlencoding, or are known to be
+    identifiers.
+
+    >>> ascii_bytes('a')
+    'a'
+    >>> ascii_bytes(u'a')
+    'a'
+    >>> ascii_bytes('å')
+    Traceback (most recent call last):
+    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
+    >>> ascii_bytes(u'å')
+    Traceback (most recent call last):
+    UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
+    """
+    assert isinstance(s, (unicode, str)), repr(s)
+    return s.encode('ascii')
+
+
+def ascii_str(s):
+    r"""
+    Simple conversion from bytes to str, *assuming* all codepoints are
+    7-bit and it thus is pure ASCII.
+    Will fail badly with UnicodeError on invalid input.
+    This should be used where encoding and "safe" ambiguity should be avoided.
+    Where strings are encoded but also in other ways are known to be ASCII, and
+    where a unicode string is wanted without caring about encoding. For example
+    to hex, base64, urlencoding, or are known to be identifiers.
+
+    >>> ascii_str('a')
+    'a'
+    >>> ascii_str(u'a')
+    Traceback (most recent call last):
+    AssertionError: u'a'
+    >>> ascii_str('å')
+    Traceback (most recent call last):
+    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
+    >>> ascii_str(u'å')
+    Traceback (most recent call last):
+    AssertionError: u'\xc3\xa5'
+    """
+    assert isinstance(s, bytes), repr(s)
+    # Note: we use "encode", even though we really *should* use "decode". But
+    # we are in py2 and don't want unicode, and encode is doing what we need for
+    # the ascii subset.
+    return s.encode('ascii')
+
+
 # Regex taken from http://www.regular-expressions.info/email.html
 email_re = re.compile(
     r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""