import bleach from bleach.linkifier import LinkifyFilter from django.utils.safestring import mark_safe def allow_a(tag: str, name: str, value: str): if name in ["href", "title", "class"]: return True elif name == "rel": # Only allow rel attributes with a small subset of values # (we're defending against, for example, rel=me) rel_values = value.split() if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values): return True return False def sanitize_post(post_html: str) -> str: """ Only allows a, br, p and span tags, and class attributes. """ cleaner = bleach.Cleaner( tags=["br", "p"], attributes={ # type:ignore "a": allow_a, "p": ["class"], "span": ["class"], }, filters=[LinkifyFilter], strip=True, ) return mark_safe(cleaner.clean(post_html)) def strip_html(post_html: str) -> str: """ Strips all tags from the text, then linkifies it. """ cleaner = bleach.Cleaner(tags=[], strip=True, filters=[LinkifyFilter]) return mark_safe(cleaner.clean(post_html)) def html_to_plaintext(post_html: str) -> str: """ Tries to do the inverse of the linebreaks filter. """ # TODO: Handle HTML entities # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach) post_html = post_html.replace("\n", "").replace("
", "\n").replace("

", "\n") # Remove all other HTML and return cleaner = bleach.Cleaner(tags=[], strip=True, filters=[]) return cleaner.clean(post_html).strip()