Changeset - 2ac4499b25eb
[Not reviewed]
default
0 1 0
Thomas De Schampheleire - 7 years ago 2019-01-26 20:27:50
thomas.de_schampheleire@nokia.com
lib: sanitize HTML for all types of README rendering, not only markdown

The repository summary page will display a rendered version of the
repository 'readme' based on its file extension. In commit 5746cc3b3fa5,
the rendered output was already sanitized when the input was markdown.
However, also readmes written in other formats, like ReStructuredText (RST)
or plain text could have content that we want sanitized.

Therefore, move the sanitizing one level up so it covers all renderers, for
now and the future.

This fixes an XSS issue when a repository readme contains javascript code,
which would be executed when the repository summary page is visited by a
user.

Reported by Bob Hogg <wombat@rwhogg.site> (thanks!).
1 file changed with 17 insertions and 17 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/markup_renderer.py
Show inline comments
 
@@ -103,102 +103,102 @@ class MarkupRenderer(object):
 
                return matchobj.group(0)
 
        pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE)
 
        text = re.sub(pattern, newline_callback, text)
 

	
 
        # Insert pre block extractions.
 
        def pre_insert_callback(matchobj):
 
            return '\n\n' + extractions[matchobj.group(1)]
 
        text = re.sub(r'{gfm-extraction-([0-9a-f]{32})\}',
 
                      pre_insert_callback, text)
 

	
 
        return text
 

	
 
    def render(self, source, filename=None):
 
        """
 
        Renders a given filename using detected renderer
 
        it detects renderers based on file extension or mimetype.
 
        At last it will just do a simple html replacing new lines with <br/>
 

	
 
        :param file_name:
 
        :param source:
 
        """
 

	
 
        renderer = self._detect_renderer(source, filename)
 
        readme_data = renderer(source)
 
        return readme_data
 
        # Allow most HTML, while preventing XSS issues:
 
        # no <script> tags, no onclick attributes, no javascript
 
        # "protocol", and also limit styling to prevent defacing.
 
        return bleach.clean(readme_data,
 
            tags=['a', 'abbr', 'b', 'blockquote', 'br', 'code', 'dd',
 
                  'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5',
 
                  'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 'pre', 'span',
 
                  'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th',
 
                  'thead', 'tr', 'ul'],
 
            attributes=['class', 'id', 'style', 'label', 'title', 'alt', 'href', 'src'],
 
            styles=['color'],
 
            protocols=['http', 'https', 'mailto'],
 
            )
 

	
 
    @classmethod
 
    def plain(cls, source, universal_newline=True):
 
        source = safe_unicode(source)
 
        if universal_newline:
 
            newline = '\n'
 
            source = newline.join(source.splitlines())
 

	
 
        def url_func(match_obj):
 
            url_full = match_obj.group(0)
 
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})
 
        source = url_re.sub(url_func, source)
 
        return '<br />' + source.replace("\n", '<br />')
 

	
 
    @classmethod
 
    def markdown(cls, source, safe=True, flavored=False):
 
        """
 
        Convert Markdown (possibly GitHub Flavored) to XSS safe HTML, possibly
 
        with "safe" fall-back to plaintext.
 

	
 
        >>> MarkupRenderer.markdown('''<img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg">''')
 
        u'<p><img id="a" src="http://example.com/test.jpg" style="color: red;"></p>'
 
        >>> MarkupRenderer.markdown('''<img class="c d" src="file://localhost/test.jpg">''')
 
        u'<p><img class="c d"></p>'
 
        >>> MarkupRenderer.markdown('''<a href="foo">foo</a>''')
 
        u'<p><a href="foo">foo</a></p>'
 
        >>> MarkupRenderer.markdown('''<script>alert(1)</script>''')
 
        u'&lt;script&gt;alert(1)&lt;/script&gt;'
 
        >>> MarkupRenderer.markdown('''<div onclick="alert(2)">yo</div>''')
 
        u'<div>yo</div>'
 
        >>> MarkupRenderer.markdown('''<a href="javascript:alert(3)">yo</a>''')
 
        u'<p><a>yo</a></p>'
 
        """
 
        source = safe_unicode(source)
 
        try:
 
            if flavored:
 
                source = cls._flavored_markdown(source)
 
            markdown_html = markdown_mod.markdown(source,
 
                                       extensions=['codehilite', 'extra'],
 
                                       extension_configs={'codehilite': {'css_class': 'code-highlight'}})
 
            # Allow most HTML, while preventing XSS issues:
 
            # no <script> tags, no onclick attributes, no javascript
 
            # "protocol", and also limit styling to prevent defacing.
 
            return bleach.clean(markdown_html,
 
                tags=['a', 'abbr', 'b', 'blockquote', 'br', 'code', 'dd',
 
                      'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5',
 
                      'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 'pre', 'span',
 
                      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th',
 
                      'thead', 'tr', 'ul'],
 
                attributes=['class', 'id', 'style', 'label', 'title', 'alt', 'href', 'src'],
 
                styles=['color'],
 
                protocols=['http', 'https', 'mailto'],
 
                )
 
            return markdown_mod.markdown(
 
                source,
 
                extensions=['codehilite', 'extra'],
 
                extension_configs={'codehilite': {'css_class': 'code-highlight'}})
 
        except Exception:
 
            log.error(traceback.format_exc())
 
            if safe:
 
                log.debug('Falling back to render in plain mode')
 
                return cls.plain(source)
 
            else:
 
                raise
 

	
 
    @classmethod
 
    def rst(cls, source, safe=True):
 
        source = safe_unicode(source)
 
        try:
 
            from docutils.core import publish_parts
 
            from docutils.parsers.rst import directives
 
            docutils_settings = dict([(alias, None) for alias in
 
                                cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])
 

	
 
            docutils_settings.update({'input_encoding': 'unicode',
 
                                      'report_level': 4})
 

	
 
            for k, v in docutils_settings.iteritems():
 
                directives.register_directive(k, v)
 

	
 
            parts = publish_parts(source=source,
0 comments (0 inline, 0 general)