Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ install-dev:
poetry install --all-extras
poetry run pre-commit install
poetry run playwright install
poetry run camoufox fetch

build:
poetry build --no-interaction -vv
Expand Down
11 changes: 11 additions & 0 deletions docs/examples/playwright_crawler.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages

A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation.

## How to use Camoufox in the Playwright crawler
Camoufox is a stealthy, minimalistic build of Firefox. For details, please visit its homepage: https://camoufox.com/
Using Camoufox in the Playwright crawler requires two simple steps:
- Install the required dependencies: `crawlee[playwright,camoufox]` or `crawlee[all]`.
- Set the `browser_type` argument of `PlaywrightCrawler` to `camoufox`.

**Warning!** Camoufox uses a custom build of Firefox. This build can be hundreds of MB in size.
You can either pre-download it with the command `python3 -m camoufox fetch`, or Camoufox will download it automatically the first time you run it if it does not find an existing binary.
For more details, please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface

## Playwright crawler example
<CodeBlock className="language-python">
{PlaywrightCrawlerExample}
</CodeBlock>
799 changes: 778 additions & 21 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ keywords = [
python = "^3.9"
apify = { version = ">=2.0.0", optional = true }
beautifulsoup4 = { version = ">=4.12.0", optional = true }
camoufox = {version= ">=0.4.5", extras = ["geoip"], optional = true }
colorama = ">=0.4.0"
cookiecutter = ">=2.6.0"
curl-cffi = { version = ">=0.7.2", optional = true }
Expand Down Expand Up @@ -92,12 +93,13 @@ types-psutil = "~5.9.5.20240205"
types-python-dateutil = "~2.9.0.20240316"

[tool.poetry.extras]
all = ["apify", "beautifulsoup4", "lxml", "html5lib", "curl-cffi", "playwright"]
all = ["apify", "beautifulsoup4", "lxml", "html5lib", "curl-cffi", "playwright", "camoufox"]
apify = ["apify"]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
curl-impersonate = ["curl-cffi"]
playwright = ["playwright"]
parsel = ["parsel"]
camoufox = ["camoufox"]

[tool.poetry.scripts]
crawlee = "crawlee._cli:cli"
Expand Down
10 changes: 10 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,16 @@ async def new_browser(self) -> PlaywrightBrowserController:
browser = await self._playwright.firefox.launch(**self._browser_options)
elif self._browser_type == 'webkit':
browser = await self._playwright.webkit.launch(**self._browser_options)
elif self._browser_type == 'camoufox':
try:
# Intentional late import of optional library. Majority of users might be using other browsers.
from camoufox.async_api import AsyncNewBrowser
except ImportError as e:
raise ImportError(
'Missing camoufox. It is optional component of crawlee. To fix please install crawlee'
' with following extras: crawlee[playwright,camoufox] or crawlee[all]'
) from e
browser = await AsyncNewBrowser(self._playwright, **self._browser_options)
else:
raise ValueError(f'Invalid browser type: {self._browser_type}')

Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/browsers/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
if TYPE_CHECKING:
from playwright.async_api import Page

BrowserType = Literal['chromium', 'firefox', 'webkit']
BrowserType = Literal['chromium', 'firefox', 'webkit', 'camoufox']


@dataclass
Expand Down
30 changes: 22 additions & 8 deletions tests/unit/playwright_crawler/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
from __future__ import annotations

import json
from platform import system
from typing import TYPE_CHECKING
from unittest import mock

import pytest

from crawlee import Glob, Request
from crawlee.fingerprint_suite._consts import (
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA,
Expand All @@ -22,6 +25,7 @@
if TYPE_CHECKING:
from yarl import URL

from crawlee.browsers._types import BrowserType
from crawlee.playwright_crawler import PlaywrightCrawlingContext


Expand Down Expand Up @@ -111,8 +115,15 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert headers['User-Agent'] == PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT


async def test_firefox_headless_headers(httpbin: URL) -> None:
crawler = PlaywrightCrawler(headless=True, browser_type='firefox')
@pytest.mark.parametrize(
'firefox_type',
[
'firefox',
'camoufox', # Builds on top of firefox.
],
)
async def test_firefox_headless_headers(httpbin: URL, firefox_type: BrowserType) -> None:
crawler = PlaywrightCrawler(headless=True, browser_type=firefox_type)
headers = dict[str, str]()

@crawler.router.default_handler
Expand All @@ -125,14 +136,17 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

await crawler.run([str(httpbin / 'get')])

assert 'User-Agent' in headers
assert 'Sec-Ch-Ua' not in headers
assert 'Sec-Ch-Ua-Mobile' not in headers
assert 'Sec-Ch-Ua-Platform' not in headers
if not (firefox_type == 'camoufox' and system() == 'Windows'):
# Camoufox seems to currently have problem with headers on Windows
# Reported camoufox issue https://github.com/daijro/camoufox/issues/79
assert 'User-Agent' in headers
assert 'Sec-Ch-Ua' not in headers
assert 'Sec-Ch-Ua-Mobile' not in headers
assert 'Sec-Ch-Ua-Platform' not in headers

assert 'headless' not in headers['User-Agent'].lower()
assert 'headless' not in headers['User-Agent'].lower()

assert headers['User-Agent'] == PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT
assert headers['User-Agent'] == PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT


async def test_custom_headers(httpbin: URL) -> None:
Expand Down