Eine Menge Aufräumarbeiten.

* Eine Testsuite, um Mahjong-Ranking-Berechnungen zu testen
* Erste Arbeiten, um die Workarounds aus dem "utils"-Paket loszuwerden
* Vieles am Code umformatiert für PEP8-Konformität
2017-06-07 13:25:30 +02:00
parent 690ebec3b0
commit 3e9689c04a
93 changed files with 33531 additions and 2737 deletions
--- a/src/utils/html_cleaner.py
+++ b/src/utils/html_cleaner.py
@@ -4,10 +4,14 @@ Created on 19.10.2011
@author: christian
 """
 from bs4 import BeautifulSoup
-#TODO: Nach BeatutifulSoup 4 convertieren
+# TODO: Nach BeautifulSoup 4 konvertieren


 class HtmlCleaner(object):
+    """
+    Tries to clean up HTML code to remove all possibilities of an XSS attack
+    and unwanted inline Javascript and CSS.
+    """
    ACCEPTABLE_ELEMENTS = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
                           'big', 'blockquote', 'br', 'button', 'caption',
                           'center', 'cite',
@@ -23,7 +27,7 @@ class HtmlCleaner(object):
                           'tfoot', 'th',
                           'thead', 'tr', 'tt', 'u', 'ul', 'var']

-    ACCEPTABELE_ATTRIBUTES = [
+    ACCEPTABLE_ATTRIBUTES = [
        'abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'class', 'alt', 'axis',
        'char', 'charoff', 'charset', 'checked', 'cite', 'clear', 'cols',
@@ -35,17 +39,28 @@ class HtmlCleaner(object):
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
        'type', 'usemap', 'valign', 'value', 'vspace', 'width']

-    counter = 1
    tag_removed = False

    def clean_attributes(self, tag):
+        """
+        Removes all attributes from an element that aren't whitelisted.
+        :param tag: a BeautifulSoup Tag element that should be scrubbed
+        :return: None
+        """
        for attr in list(tag.attrs.keys()):
-            if attr not in self.ACCEPTABELE_ATTRIBUTES:
+            if attr not in self.ACCEPTABLE_ATTRIBUTES:
                del tag[attr]
            elif tag[attr].count('script:'):
                del tag[attr]

    def clean_tag(self, tag):
+        """
+        Removes the entire tag with all its content, when it's not on the
+        whitelist. If the tag is acceptable it will be passed to
+        clean_attributes
+        :param tag: BeautifulSoup Tag element that should be scrubbed
+        :return: None
+        """
        if tag.name not in self.ACCEPTABLE_ELEMENTS:
            tag.extract()  # remove the bad ones
            self.tag_removed = True
@@ -55,12 +70,12 @@ class HtmlCleaner(object):
    def clean_html(self, fragment=''):
        """
        Reparses and cleans the html from XSS Attacks until it stops changing.
-        @param fragment:
+        :param str fragment: HTML Text that should be cleaned up
+        :return str: scrubbed HTML Text
        """
        while True:
            soup = BeautifulSoup(fragment, "html.parser")
            self.tag_removed = False
-            self.counter += 1
            for tag in soup.find_all(True):
                self.clean_tag(tag)
            fragment = str(soup)