Complete stubs for bleach (#9314)

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Avasam <samuel.06@hotmail.com>
This commit is contained in:
Nikita Sobolev
2024-02-21 23:03:24 +03:00
committed by GitHub
parent 78b7dc6167
commit a2095002e4
9 changed files with 128 additions and 66 deletions

View File

@@ -27,7 +27,6 @@
"stubs/antlr4-python3-runtime",
"stubs/aws-xray-sdk",
"stubs/beautifulsoup4",
"stubs/bleach",
"stubs/boltons",
"stubs/boto",
"stubs/braintree",

View File

@@ -1,2 +1,5 @@
bleach.css_sanitizer # Requires tinycss2 to be installed
bleach.html5lib_shim.*
# Internal private stuff:
bleach._vendor.*
# Is a property returning a method, simplified:
bleach.html5lib_shim.InputStreamWithMemory.changeEncoding

View File

@@ -1,6 +1,6 @@
version = "6.1.*"
requires = ["types-html5lib"]
upstream_repository = "https://github.com/mozilla/bleach"
partial_stub = true
[tool.stubtest]
ignore_missing_stub = true
extras = ["css"]

View File

@@ -1,7 +1,8 @@
from collections.abc import Container
from typing import Final
ALLOWED_CSS_PROPERTIES: frozenset[str]
ALLOWED_SVG_PROPERTIES: frozenset[str]
ALLOWED_CSS_PROPERTIES: Final[frozenset[str]]
ALLOWED_SVG_PROPERTIES: Final[frozenset[str]]
class CSSSanitizer:
allowed_css_properties: Container[str]

View File

@@ -1,30 +1,70 @@
from _typeshed import Incomplete
import re
from codecs import CodecInfo
from collections.abc import Generator, Iterable, Iterator
from typing import Any, Final, Protocol
class HTMLParser: # actually html5lib.HTMLParser
def __getattr__(self, __name: str) -> Incomplete: ...
# We don't re-export any `html5lib` types / values here, because they are not
# really public and may change at any time. This is just a helper module,
# import things directly from `html5lib` instead!
from html5lib import HTMLParser
from html5lib._inputstream import HTMLBinaryInputStream, HTMLUnicodeInputStream
from html5lib._tokenizer import HTMLTokenizer
from html5lib._trie import Trie
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers.base import TreeWalker
class Filter:  # actually html5lib.filters.base.Filter
    # Minimal local stand-in stub for html5lib's token-stream filter base.
    # Holds a token source, iterates over it, and forwards unknown attribute
    # lookups to that source.
    source: Incomplete  # the wrapped token stream; untyped at stub level
    def __init__(self, source) -> None: ...
    def __iter__(self) -> Iterator[Incomplete]: ...
    def __getattr__(self, name: str) -> Incomplete: ...  # copy attributes from source
# Is actually webencodings.Encoding; declared structurally here so these
# stubs do not take a hard dependency on the webencodings package.
class _Encoding(Protocol):
    name: str  # encoding label (e.g. "utf-8") -- presumably; confirm against webencodings
    codec_info: CodecInfo  # the stdlib codec backing this encoding
    def __init__(self, name: str, codec_info: CodecInfo) -> None: ...
class SanitizerFilter: # actually html5lib.filters.sanitizer.Filter
def __getattr__(self, __name: str) -> Incomplete: ...
HTML_TAGS: Final[frozenset[str]]
HTML_TAGS_BLOCK_LEVEL: Final[frozenset[str]]
AMP_SPLIT_RE: Final[re.Pattern[str]]
ENTITIES: Final[dict[str, str]]
ENTITIES_TRIE: Final[Trie]
TAG_TOKEN_TYPES: Final[set[int]]
TAG_TOKEN_TYPE_CHARACTERS: Final[int]
TAG_TOKEN_TYPE_END: Final[int]
TAG_TOKEN_TYPE_PARSEERROR: Final[int]
TAG_TOKEN_TYPE_START: Final[int]
class HTMLSerializer: # actually html5lib.serializer.HTMLSerializer
def __getattr__(self, __name: str) -> Incomplete: ...
class InputStreamWithMemory:
    # Wrapper around an html5lib HTMLUnicodeInputStream (see __init__) that
    # additionally tracks consumed characters; the get_tag/start_tag pair
    # suggests it remembers the text of the tag currently being tokenized --
    # NOTE(review): behavior inferred from method names, confirm in bleach source.
    #
    # position/reset are delegated unchanged to the underlying stream class:
    position = HTMLUnicodeInputStream.position
    reset = HTMLUnicodeInputStream.reset
    def __init__(self, inner_stream: HTMLUnicodeInputStream) -> None: ...
    @property
    def errors(self) -> list[str]: ...
    @property
    def charEncoding(self) -> tuple[_Encoding, str]: ...
    # If inner_stream wasn't a HTMLBinaryInputStream, this will error at runtime
    # Is a property returning a method, simplified:
    changeEncoding = HTMLBinaryInputStream.changeEncoding
    def char(self) -> str: ...
    def charsUntil(self, characters: Iterable[str], opposite: bool = False) -> str: ...
    def unget(self, char: str | None) -> None: ...
    def get_tag(self) -> str: ...
    def start_tag(self) -> None: ...
class BleachHTMLTokenizer(HTMLTokenizer):
    # bleach's html5lib tokenizer subclass.
    consume_entities: bool  # mirrors the constructor flag of the same name
    # stream is narrowed from the base class's input stream to bleach's wrapper:
    stream: InputStreamWithMemory
    emitted_last_token: dict[str, Any] | None  # last token emitted, or None
    def __init__(self, consume_entities: bool = False, **kwargs: Any) -> None: ...
class BleachHTMLParser(HTMLParser):
tags: list[str] | None
strip: bool
consume_entities: bool
def __init__(self, tags: Iterable[str] | None, strip: bool, consume_entities: bool, **kwargs) -> None: ...
def __init__(self, tags: Iterable[str] | None, strip: bool, consume_entities: bool, **kwargs: Any) -> None: ...
class BleachHTMLSerializer(HTMLSerializer):
escape_rcdata: bool
def escape_base_amp(self, stoken: str) -> Generator[str, None, None]: ...
def serialize(self, treewalker, encoding: str | None = None) -> Generator[str, None, None]: ...
def serialize(self, treewalker: TreeWalker, encoding: str | None = None) -> Generator[str, None, None]: ... # type: ignore[override]
def __getattr__(__name: str) -> Incomplete: ...
# Module-level helpers; names indicate HTML character-entity conversion and
# matching -- NOTE(review): semantics inferred from names, confirm at runtime.
def convert_entity(value: str) -> str | None: ...  # None presumably means "not a known entity"
def convert_entities(text: str) -> str: ...
def match_entity(stream: str) -> str | None: ...
def next_possible_entity(text: str) -> Iterator[str]: ...

View File

@@ -1,22 +1,25 @@
from _typeshed import Incomplete
from collections.abc import Container, Iterable, Iterator
from collections.abc import Container, Iterable, Iterator, Sequence
from re import Pattern
from typing import Any, Final
from typing_extensions import TypeAlias
from .callbacks import _Callback
from .html5lib_shim import Filter
from html5lib.filters.base import Filter
from html5lib.treewalkers.base import TreeWalker
DEFAULT_CALLBACKS: list[_Callback]
from .callbacks import _Callback, _HTMLAttrs
TLDS: list[str]
DEFAULT_CALLBACKS: Final[list[_Callback]]
TLDS: Final[list[str]]
def build_url_re(tlds: Iterable[str] = ..., protocols: Iterable[str] = ...) -> Pattern[str]: ...
URL_RE: Pattern[str]
PROTO_RE: Pattern[str]
URL_RE: Final[Pattern[str]]
PROTO_RE: Final[Pattern[str]]
def build_email_re(tlds: Iterable[str] = ...) -> Pattern[str]: ...
EMAIL_RE: Pattern[str]
EMAIL_RE: Final[Pattern[str]]
class Linker:
def __init__(
@@ -30,6 +33,10 @@ class Linker:
) -> None: ...
def linkify(self, text: str) -> str: ...
# TODO: `_Token` might be converted into `TypedDict`
# or `html5lib` token might be reused
_Token: TypeAlias = dict[str, Any]
class LinkifyFilter(Filter):
callbacks: Iterable[_Callback]
skip_tags: Container[str]
@@ -38,18 +45,18 @@ class LinkifyFilter(Filter):
email_re: Pattern[str]
def __init__(
self,
source,
source: TreeWalker,
callbacks: Iterable[_Callback] | None = ...,
skip_tags: Container[str] | None = None,
parse_email: bool = False,
url_re: Pattern[str] = ...,
email_re: Pattern[str] = ...,
) -> None: ...
def apply_callbacks(self, attrs, is_new): ...
def extract_character_data(self, token_list): ...
def handle_email_addresses(self, src_iter): ...
def strip_non_url_bits(self, fragment): ...
def handle_links(self, src_iter): ...
def handle_a_tag(self, token_buffer): ...
def extract_entities(self, token): ...
def apply_callbacks(self, attrs: _HTMLAttrs, is_new: bool) -> _HTMLAttrs | None: ...
def extract_character_data(self, token_list: Iterable[_Token]) -> str: ...
def handle_email_addresses(self, src_iter: Iterable[_Token]) -> Iterator[_Token]: ...
def strip_non_url_bits(self, fragment: str) -> tuple[str, str, str]: ...
def handle_links(self, src_iter: Iterable[_Token]) -> Iterator[_Token]: ...
def handle_a_tag(self, token_buffer: Sequence[_Token]) -> Iterator[_Token]: ...
def extract_entities(self, token: _Token) -> Iterator[_Token]: ...
def __iter__(self) -> Iterator[Incomplete]: ...

View File

@@ -0,0 +1 @@
from urllib import parse as parse

View File

@@ -1,20 +1,27 @@
from _typeshed import Incomplete
from collections.abc import Callable, Iterable
from collections.abc import Callable, Container, Iterable, Iterator
from re import Pattern
from typing import Protocol
from typing import Final, Protocol
from typing_extensions import TypeAlias
from html5lib.filters.base import Filter
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.treewalkers.base import TreeWalker
from . import _HTMLAttrKey
from .css_sanitizer import CSSSanitizer
from .html5lib_shim import BleachHTMLParser, BleachHTMLSerializer, SanitizerFilter
from .html5lib_shim import BleachHTMLParser, BleachHTMLSerializer
from .linkifier import _Token
ALLOWED_TAGS: frozenset[str]
ALLOWED_ATTRIBUTES: dict[str, list[str]]
ALLOWED_PROTOCOLS: frozenset[str]
ALLOWED_TAGS: Final[frozenset[str]]
ALLOWED_ATTRIBUTES: Final[dict[str, list[str]]]
ALLOWED_PROTOCOLS: Final[frozenset[str]]
INVISIBLE_CHARACTERS: str
INVISIBLE_CHARACTERS_RE: Pattern[str]
INVISIBLE_REPLACEMENT_CHAR: str
INVISIBLE_CHARACTERS: Final[str]
INVISIBLE_CHARACTERS_RE: Final[Pattern[str]]
INVISIBLE_REPLACEMENT_CHAR: Final = "?"
class NoCssSanitizerWarning(UserWarning): ...
# A html5lib Filter class
class _Filter(Protocol):
@@ -24,18 +31,16 @@ _AttributeFilter: TypeAlias = Callable[[str, str, str], bool]
_AttributeDict: TypeAlias = dict[str, list[str] | _AttributeFilter] | dict[str, list[str]] | dict[str, _AttributeFilter]
_Attributes: TypeAlias = _AttributeFilter | _AttributeDict | list[str]
_TreeWalker: TypeAlias = Callable[[Incomplete], Incomplete]
class Cleaner:
tags: Iterable[str]
attributes: _Attributes
protocols: Iterable[str]
strip: bool
strip_comments: bool
filters: Iterable[_Filter]
filters: Iterable[Filter]
css_sanitizer: CSSSanitizer | None
parser: BleachHTMLParser
walker: _TreeWalker
walker: TreeWalker
serializer: BleachHTMLSerializer
def __init__(
self,
@@ -63,7 +68,7 @@ class BleachSanitizerFilter(SanitizerFilter):
css_sanitizer: CSSSanitizer | None
def __init__(
self,
source,
source: TreeWalker,
allowed_tags: Iterable[str] = ...,
attributes: _Attributes = ...,
allowed_protocols: Iterable[str] = ...,
@@ -74,12 +79,11 @@ class BleachSanitizerFilter(SanitizerFilter):
strip_html_comments: bool = True,
css_sanitizer: CSSSanitizer | None = None,
) -> None: ...
def sanitize_stream(self, token_iterator): ...
def merge_characters(self, token_iterator): ...
def __iter__(self): ...
def sanitize_token(self, token): ...
def sanitize_characters(self, token): ...
def sanitize_uri_value(self, value, allowed_protocols): ...
def allow_token(self, token): ...
def disallowed_token(self, token): ...
def sanitize_css(self, style): ...
def sanitize_stream(self, token_iterator: Iterable[_Token]) -> Iterator[_Token]: ...
def merge_characters(self, token_iterator: Iterable[_Token]) -> Iterator[_Token]: ...
def __iter__(self) -> Iterator[_Token]: ...
def sanitize_token(self, token: _Token) -> _Token | list[_Token] | None: ...
def sanitize_characters(self, token: _Token) -> _Token | list[_Token]: ...
def sanitize_uri_value(self, value: str, allowed_protocols: Container[str]) -> str | None: ...
def allow_token(self, token: _Token) -> _Token: ...
def disallowed_token(self, token: _Token) -> _Token: ...

View File

@@ -1,7 +1,14 @@
from _typeshed import Incomplete, SupportsRead
from typing import Any, overload
from codecs import CodecInfo
from typing import Any, Protocol, overload
from typing_extensions import TypeAlias
# Is actually webencodings.Encoding; declared here as a structural Protocol
# so the html5lib stubs avoid depending on the webencodings package.
class _Encoding(Protocol):
    name: str  # encoding label (e.g. "utf-8") -- presumably; confirm against webencodings
    codec_info: CodecInfo  # the stdlib codec backing this encoding
    def __init__(self, name: str, codec_info: CodecInfo) -> None: ...
_UnicodeInputStream: TypeAlias = str | SupportsRead[str]
_BinaryInputStream: TypeAlias = bytes | SupportsRead[bytes]
_InputStream: TypeAlias = _UnicodeInputStream # noqa: Y047 # used in other files
@@ -42,13 +49,13 @@ def HTMLInputStream(
class HTMLUnicodeInputStream:
reportCharacterErrors: Any
newLines: Any
charEncoding: Any
charEncoding: tuple[_Encoding, str]
dataStream: Any
def __init__(self, source: _UnicodeInputStream) -> None: ...
chunk: str
chunkSize: int
chunkOffset: int
errors: Any
errors: list[str]
prevNumLines: int
prevNumCols: int
def reset(self) -> None: ...
@@ -70,7 +77,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
same_origin_parent_encoding: Any
likely_encoding: Any
default_encoding: Any
charEncoding: Any
charEncoding: tuple[_Encoding, str]
def __init__(
self,
source: _BinaryInputStream,
@@ -85,7 +92,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
def reset(self) -> None: ...
def openStream(self, source): ...
def determineEncoding(self, chardet: bool = True): ...
def changeEncoding(self, newEncoding) -> None: ...
def changeEncoding(self, newEncoding: str | bytes | None) -> None: ...
def detectBOM(self): ...
def detectEncodingMeta(self): ...