diff --git a/rhodecode/lib/__init__.py b/rhodecode/lib/__init__.py --- a/rhodecode/lib/__init__.py +++ b/rhodecode/lib/__init__.py @@ -23,14 +23,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - -try: - import json -except ImportError: - #python 2.5 compatibility - import simplejson as json - - def __get_lem(): from pygments import lexers from string import lower @@ -157,44 +149,69 @@ def generate_api_key(username, salt=None return hashlib.sha1(username + salt).hexdigest() -def safe_unicode(_str, from_encoding='utf8'): +def safe_unicode(str_, from_encoding='utf8'): """ - safe unicode function. In case of UnicodeDecode error we try to return - unicode with errors replaceed + safe unicode function. Does few trick to turn str_ into unicode + + In case of UnicodeDecode error we try to return it with encoding detected + by chardet library if it fails fallback to unicode with errors replaced - :param _str: string to decode + :param str_: string to decode :rtype: unicode :returns: unicode object """ + if isinstance(str_, unicode): + return str_ - if isinstance(_str, unicode): - return _str + try: + return unicode(str_) + except UnicodeDecodeError: + pass + + try: + return unicode(str_, from_encoding) + except UnicodeDecodeError: + pass try: - u_str = unicode(_str, from_encoding) - except UnicodeDecodeError: - u_str = unicode(_str, from_encoding, 'replace') - - return u_str - + import chardet + encoding = chardet.detect(str_)['encoding'] + if encoding is None: + raise Exception() + return str_.decode(encoding) + except (ImportError, UnicodeDecodeError, Exception): + return unicode(str_, from_encoding, 'replace') -def safe_str(_unicode, to_encoding='utf8'): +def safe_str(unicode_, to_encoding='utf8'): """ - safe str function. In case of UnicodeEncode error we try to return - str with errors replaceed + safe str function. Does few trick to turn unicode_ into string + + In case of UnicodeEncodeError we try to return it with encoding detected + by chardet library if it fails fallback to string with errors replaced - :param _unicode: unicode to encode + :param unicode_: unicode to encode :rtype: str :returns: str object """ - if isinstance(_unicode, str): - return _unicode + if isinstance(unicode_, str): + return unicode_ + + try: + return unicode_.encode(to_encoding) + except UnicodeEncodeError: + pass try: - safe_str = str(_unicode) - except UnicodeEncodeError: - safe_str = _unicode.encode(to_encoding, 'replace') + import chardet + encoding = chardet.detect(unicode_)['encoding'] + print encoding + if encoding is None: + raise UnicodeEncodeError() + + return unicode_.encode(encoding) + except (ImportError, UnicodeEncodeError): + return unicode_.encode(to_encoding, 'replace') return safe_str @@ -361,4 +378,4 @@ def get_changeset_safe(repo, rev): except RepositoryError: from rhodecode.lib.utils import EmptyChangeset cs = EmptyChangeset(requested_revision=rev) - return cs \ No newline at end of file + return cs