|
4 | 4 | import pytest |
5 | 5 | from playwright.async_api import Error as PlaywrightError |
6 | 6 | from scrapy import Spider |
7 | | -from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG |
| 7 | +from scrapy.http.headers import Headers |
| 8 | +from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG, _encode_body |
| 9 | + |
| 10 | + |
| 11 | +# page content retrieval |
| 12 | +# ====================== |
8 | 13 |
|
9 | 14 |
|
10 | 15 | @pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added on Python 3.8") |
@@ -67,3 +72,65 @@ async def test_get_page_content_reraise_unknown_exception(): |
67 | 72 | scrapy_request_url="https://example.org", |
68 | 73 | scrapy_request_method="GET", |
69 | 74 | ) |
| 75 | + |
| 76 | + |
| 77 | +# body encoding |
| 78 | +# ============= |
| 79 | + |
| 80 | + |
def body_str(charset: str, content: str = "áéíóú") -> str:
    """Return a small HTML document to use as an encoding fixture.

    The document declares ``charset`` through a ``<meta charset=...>`` tag in
    its head and places ``content`` inside a single ``<p>`` element. Leading
    and trailing whitespace is stripped from the returned string.
    """
    document = f"""
    <!doctype html>
    <html>
    <head>
    <meta charset="{charset}">
    </head>
    <body>
    <p>{content}</p>
    </body>
    </html>
    """
    return document.strip()
| 93 | + |
| 94 | + |
@pytest.mark.asyncio
async def test_encode_from_headers():
    """A charset declared in the headers wins over the one in the body.

    The markup declares gb2312 while the Content-Type header declares
    ISO-8859-1; the encoding reported by ``_encode_body`` is expected to be
    "cp1252", and the returned body must be the text encoded with it.
    """
    markup = body_str(charset="gb2312")
    response_headers = Headers({"content-type": "text/html; charset=ISO-8859-1"})
    encoded, detected = _encode_body(headers=response_headers, text=markup)
    assert detected == "cp1252"
    assert encoded == markup.encode(detected)
| 105 | + |
| 106 | + |
@pytest.mark.asyncio
async def test_encode_from_body():
    """With no charset in the headers, the one declared in the body is used.

    The markup declares gb2312 and the headers are empty; the encoding
    reported by ``_encode_body`` is expected to be "gb18030".
    """
    markup = body_str(charset="gb2312")
    empty_headers = Headers({})
    encoded, detected = _encode_body(headers=empty_headers, text=markup)
    assert detected == "gb18030"
    assert encoded == markup.encode(detected)
| 114 | + |
| 115 | + |
@pytest.mark.asyncio
async def test_encode_fallback_utf8():
    """With no charset declared in headers or body, utf-8 is the fallback."""
    markup = "<html>áéíóú</html>"
    no_headers = Headers()
    encoded, detected = _encode_body(headers=no_headers, text=markup)
    assert detected == "utf-8"
    assert encoded == markup.encode(detected)
| 123 | + |
| 124 | + |
@pytest.mark.asyncio
async def test_encode_mismatch():
    """Headers and body disagree, and the headers charset cannot encode the text.

    The header declares ISO-8859-1 but the content is Chinese text, so the
    charset declared in the body is used instead (the first one that works);
    the reported encoding is expected to be "gb18030".
    """
    markup = body_str(charset="gb2312", content="空手道")
    response_headers = Headers({"content-type": "text/html; charset=ISO-8859-1"})
    encoded, detected = _encode_body(headers=response_headers, text=markup)
    assert detected == "gb18030"
    assert encoded == markup.encode(detected)
0 commit comments