Changeset - 0f69b5c35b2b
[Not reviewed]
default
0 2 0
Mads Kiilerich - 6 years ago 2019-12-27 23:30:56
mads@kiilerich.com
Grafted from: 57ed7fc231aa
lib: introduce string conversion functions for ASCII without further encoding concerns

Avoid the trial-and-error and vagueness of the "safe" functions.

This should replace some use of safe_unicode and safe_str. It will mostly be a
noop in py2 but will be crucial in py3.
2 files changed with 57 insertions and 1 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/utils2.py
Show inline comments
 
@@ -22,49 +22,49 @@ models, controllers, etc.  to prevent im
 
This file was forked by the Kallithea project in July 2014.
 
Original author and date, and relevant copyright and licensing information is below:
 
:created_on: Jan 5, 2011
 
:author: marcink
 
:copyright: (c) 2013 RhodeCode GmbH, and others.
 
:license: GPLv3, see LICENSE.md for more details.
 
"""
 

	
 
from __future__ import print_function
 

	
 
import binascii
 
import datetime
 
import os
 
import pwd
 
import re
 
import time
 
import urllib
 

	
 
import urlobject
 
from tg.i18n import ugettext as _
 
from tg.i18n import ungettext
 
from webhelpers2.text import collapse, remove_formatting, strip_tags
 

	
 
from kallithea.lib.compat import json
 
from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode  # re-export
 
from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode  # re-export
 
from kallithea.lib.vcs.utils.lazy import LazyProperty
 

	
 

	
 
def str2bool(_str):
    """
    Translate a string (or other value) into a boolean.

    ``None`` gives False. A value that compares equal to True or False (the
    booleans themselves, but also 1, 0, 1.0, ...) is normalised to a real
    bool. Anything else is converted with ``str()``, stripped and lower-cased,
    and is True only if the result is one of 't', 'true', 'y', 'yes', 'on'
    or '1'.

    :param _str: string value to translate into boolean
    :rtype: boolean
    :returns: boolean from given string
    """
    if _str is None:
        return False
    if _str in (True, False):
        # 1 == True and 0 == False, so the membership test also matches
        # numbers; bool() makes sure an actual boolean is returned for them.
        return bool(_str)
    normalized = str(_str).strip().lower()
    return normalized in ('t', 'true', 'y', 'yes', 'on', '1')
 

	
 

	
 
def aslist(obj, sep=None, strip=True):
 
    """
 
    Returns given string separated by sep as list
 

	
kallithea/lib/vcs/utils/__init__.py
Show inline comments
 
# -*- coding: utf-8 -*-
 

	
 
"""
 
This module provides some useful tools for ``vcs`` like annotate/diff html
 
output. It also includes some internal helpers.
 
"""
 

	
 
import datetime
 
import re
 
import time
 

	
 

	
 
def makedate():
    """
    Return the current local time as a ``(timestamp, offset)`` tuple.

    ``timestamp`` is ``time.mktime()`` of the current local time. ``offset``
    is ``time.altzone`` when the local time reports DST as active and the
    platform defines a DST zone, otherwise ``time.timezone``.
    """
    now = time.localtime()
    dst_in_effect = now.tm_isdst == 1 and time.daylight
    offset = time.altzone if dst_in_effect else time.timezone
    return time.mktime(now), offset
 

	
 

	
 
def aslist(obj, sep=None, strip=True):
 
    """
 
    Returns given string separated by sep as list
 

	
 
    :param obj:
 
@@ -100,48 +102,102 @@ def safe_unicode(s):
 
def safe_bytes(s):
    """
    Turn s into a bytes string without failing on unencodable characters.

    Bytes are returned unchanged. Unicode is encoded with the first of the
    configured default encodings that can represent it. If every configured
    encoding raises UnicodeEncodeError, the first configured encoding is used
    with errors replaced.
    """
    if not isinstance(s, bytes):
        # Only unicode is accepted here: bytes cannot coerce with __str__ or
        # handle None or int.
        assert isinstance(s, unicode), repr(s)

        from kallithea.lib.vcs.conf import settings
        for codec in settings.DEFAULT_ENCODINGS:
            try:
                encoded = s.encode(codec)
            except UnicodeEncodeError:
                continue
            else:
                return encoded

        # Nothing could encode s losslessly - fall back to a lossy encoding.
        s = s.encode(settings.DEFAULT_ENCODINGS[0], 'replace')
    return s
 

	
 

	
 
# safe_str is deprecated. For now it is just another name for safe_bytes; it
# will be redefined when changing to py3.
safe_str = safe_bytes
 

	
 

	
 
def ascii_bytes(s):
    """
    Simple conversion from unicode/str to bytes, *assuming* all codepoints are
    7-bit and it thus is pure ASCII.
    Will fail badly with UnicodeError on invalid input.
    This should be used where encoding and "safe" ambiguity should be avoided:
    where strings already have been encoded in other ways but still are unicode
    strings - for example to hex, base64, json, urlencoding, or are known to be
    identifiers.

    >>> ascii_bytes('a')
    'a'
    >>> ascii_bytes(u'a')
    'a'
    >>> ascii_bytes('å')
    Traceback (most recent call last):
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
    >>> ascii_bytes(u'å')
    Traceback (most recent call last):
    UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
    """
    # Anything that is not a string type fails with an AssertionError showing
    # repr(s).
    assert isinstance(s, (unicode, str)), repr(s)
    return s.encode('ascii')
 

	
 

	
 
def ascii_str(s):
    r"""
    Simple conversion from bytes to str, *assuming* all codepoints are
    7-bit and it thus is pure ASCII.
    Will fail badly with UnicodeError on invalid input.
    This should be used where encoding and "safe" ambiguity should be avoided:
    where strings are encoded but also in other ways are known to be ASCII, and
    where a unicode string is wanted without caring about encoding. For example
    to hex, base64, urlencoding, or are known to be identifiers.

    >>> ascii_str('a')
    'a'
    >>> ascii_str(u'a')
    Traceback (most recent call last):
    AssertionError: u'a'
    >>> ascii_str('å')
    Traceback (most recent call last):
    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
    >>> ascii_str(u'å')
    Traceback (most recent call last):
    AssertionError: u'\xc3\xa5'
    """
    # Only bytes are accepted; anything else fails with an AssertionError
    # showing repr(s).
    assert isinstance(s, bytes), repr(s)
    # Note: we use "encode", even though we really *should* use "decode". But
    # we are in py2 and presumably don't want a unicode result there (the
    # original note read "don't want py2" - TODO confirm intent), and encode is
    # doing what we need for the ascii subset.
    return s.encode('ascii')
 

	
 

	
 
# Regex taken from http://www.regular-expressions.info/email.html
# Matches <local part>@<domain>, ignoring case:
# - the local part is one or more dot-separated runs of letters, digits and
#   the characters !#$%&'*+/=?^_`{|}~-
# - the domain is two or more dot-separated labels, each starting and ending
#   with a letter or digit and optionally containing hyphens in between.
email_re = re.compile(
    r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""
    r"""(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?""",
    re.IGNORECASE)
 

	
 

	
 
def author_email(author):
 
    """
 
    Returns email address of given author string.
 
    If author contains <> brackets, only look inside that.
 
    If any RFC valid email address is found, return that.
 
    Else, return empty string.
 

	
 
    """
 
    if not author:
 
        return ''
 

	
 
    l = author.find('<') + 1
 
    if l != 0:
 
        r = author.find('>', l)
 
        if r != -1:
 
            author = author[l:r]
 

	
0 comments (0 inline, 0 general)