diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index efdbcedeec..1079b3f8fa 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -10,11 +10,11 @@ from crawlee.browsers._base_browser_controller import BaseBrowserController +from playwright.async_api import Browser + if TYPE_CHECKING: from collections.abc import Mapping - from playwright.async_api import Browser - from crawlee.proxy_configuration import ProxyInfo @@ -68,7 +68,10 @@ def has_free_capacity(self) -> bool: @property @override def is_browser_connected(self) -> bool: - return self._browser.is_connected() + if isinstance(self._browser, Browser): + return self._browser.is_connected() + else: + return True @override async def new_page( diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index 8a67b85e12..1b4b3d8e92 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -33,6 +33,7 @@ def __init__( browser_options: Mapping[str, Any] | None = None, page_options: Mapping[str, Any] | None = None, max_open_pages_per_browser: int = 20, + cookies: Mapping[str, Any] | None = None, ) -> None: """Create a new instance. @@ -50,6 +51,7 @@ def __init__( self._playwright_context_manager = async_playwright() self._playwright: Playwright | None = None + self._cookies = cookies @property @override @@ -93,7 +95,7 @@ async def new_browser(self) -> PlaywrightBrowserController: raise RuntimeError('Playwright browser plugin is not initialized.') if self._browser_type == 'chromium': - browser = await self._playwright.chromium.launch(**self._browser_options) + browser = await self._playwright.chromium.launch_persistent_context(user_data_dir='', **self._browser_options) elif self._browser_type == 'firefox': browser = await self._playwright.firefox.launch(**self._browser_options) elif self._browser_type == 'webkit': @@ -101,6 +103,10 @@ async def new_browser(self) -> PlaywrightBrowserController: else: raise ValueError(f'Invalid browser type: {self._browser_type}') + # TODO(Ishaan): Clean this up. + if self._cookies: + await browser.add_cookies(self._cookies) + return PlaywrightBrowserController( browser, max_open_pages_per_browser=self._max_open_pages_per_browser, diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py index 5968b54634..49200e2fb2 100644 --- a/src/crawlee/playwright_crawler/_playwright_crawler.py +++ b/src/crawlee/playwright_crawler/_playwright_crawler.py @@ -143,7 +143,7 @@ async def enqueue_links( link_user_data.setdefault('label', label) try: - request = BaseRequestData.from_url(url, user_data=link_user_data) + request = BaseRequestData.from_url(url, user_data=link_user_data, keep_url_fragment=True) except ValidationError as exc: context.log.debug( f'Skipping URL "{url}" due to invalid format: {exc}. '