# HG changeset patch # User Mads Kiilerich # Date 2016-02-15 19:29:26 # Node ID 1fd82c81118df05c1619d1d0f34eb870a76e91d0 # Parent b6c702202f8290b345cb42e320c07fdaaa5472b1 vcs: better handling of invalid email addresses: don't consider them email addresses 13da89053853 was in principle right in always returning email addresses as string ... but unfortunately the function also returned invalid email addresses that didn't fit into strings. To fix this, the function is refactored to always use regexp matching of valid email addresses ... and to be simpler. The behaviour should be the same as before for all valid email addresses. diff --git a/kallithea/lib/vcs/utils/__init__.py b/kallithea/lib/vcs/utils/__init__.py --- a/kallithea/lib/vcs/utils/__init__.py +++ b/kallithea/lib/vcs/utils/__init__.py @@ -5,6 +5,7 @@ output. It also includes some internal h import time import datetime +import re def makedate(): @@ -150,30 +151,33 @@ def safe_str(unicode_, to_encoding=None) return unicode_.encode(to_encoding[0], 'replace') +# Regex taken from http://www.regular-expressions.info/email.html +email_re = re.compile( + r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@""" + r"""(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?""", + re.IGNORECASE) + def author_email(author): """ - returns email address of given author. - If any of <,> sign are found, it fallbacks to regex findall() - and returns first found result or empty string + Returns email address of given author string. + If author contains <> brackets, only look inside that. + If any RFC valid email address is found, return that. + Else, return empty string. 
- Regex taken from http://www.regular-expressions.info/email.html """ if not author: return '' - import re - r = author.find('>') - l = author.find('<') - if l == -1 or r == -1: - # fallback to regex match of email out of a string - email_re = re.compile(r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!""" - r"""#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z""" - r"""0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]""" - r"""*[a-z0-9])?""", re.IGNORECASE) - m = re.findall(email_re, author) - return safe_str(m[0]) if m else '' + l = author.find('<') + 1 + if l != 0: + r = author.find('>', l) + if r != -1: + author = author[l:r] - return safe_str(author[l + 1:r].strip()) + m = email_re.search(author) + if m is None: + return '' + return safe_str(m.group(0)) def author_name(author): diff --git a/kallithea/tests/vcs/test_git.py b/kallithea/tests/vcs/test_git.py --- a/kallithea/tests/vcs/test_git.py +++ b/kallithea/tests/vcs/test_git.py @@ -604,7 +604,7 @@ class GitChangesetTest(unittest.TestCase self.assertEqual('lukasz.balcerzak@python-center.pl', self.repo.get_changeset('ff7ca51e58c505fec0dd2491de52c622bb7a806b') \ .author_email) - self.assertEqual('none@none', + self.assertEqual('', self.repo.get_changeset('8430a588b43b5d6da365400117c89400326e7992') \ .author_email) @@ -615,7 +615,7 @@ class GitChangesetTest(unittest.TestCase self.assertEqual('Lukasz Balcerzak', self.repo.get_changeset('ff7ca51e58c505fec0dd2491de52c622bb7a806b') \ .author_name) - self.assertEqual('marcink', + self.assertEqual('marcink none@none', self.repo.get_changeset('8430a588b43b5d6da365400117c89400326e7992') \ .author_name) diff --git a/kallithea/tests/vcs/test_utils.py b/kallithea/tests/vcs/test_utils.py --- a/kallithea/tests/vcs/test_utils.py +++ b/kallithea/tests/vcs/test_utils.py @@ -206,7 +206,7 @@ class TestAuthorExtractors(unittest.Test ('Mr Double Name withemail@example.com ', ('Mr Double Name', 'withemail@example.com')), (u'John Doe <джондо à éẋàṁṗłê.ç°ḿ>', - (u'John Doe 
<\u0434\u0436\u043e\u043d\u0434\u043e \xe0 \xe9\u1e8b\xe0\u1e41\u1e57\u0142\xea.\xe7\xb0\u1e3f>', '\xd0\xb4\xd0\xb6\xd0\xbe\xd0\xbd\xd0\xb4\xd0\xbe \xc3\xa0 \xc3\xa9\xe1\xba\x8b\xc3\xa0\xe1\xb9\x81\xe1\xb9\x97\xc5\x82\xc3\xaa.\xc3\xa7\xc2\xb0\xe1\xb8\xbf')), + (u'John Doe <\u0434\u0436\u043e\u043d\u0434\u043e \xe0 \xe9\u1e8b\xe0\u1e41\u1e57\u0142\xea.\xe7\xb0\u1e3f>', '')), ] def test_author_email(self):