How to bypass "Too many open files" error when using Playwright?

I'm using Playwright to crawl websites. I have a scraping function that uses Playwright, and I've implemented a Python object that uses this function to crawl websites in a breadth-first manner.

Below is the scraping function:

import logging
from collections import deque

from playwright.async_api import Browser, BrowserContext, async_playwright


async def fetch_page_content(
    url: str,
    browser: Browser | None = None,
    context: BrowserContext | None = None,
    open_pages: deque | None = None,
    max_open_pages: int = 100,
    timeout: int = 60000,
    headless: bool = True,
    logger: logging.Logger | None = None,
) -> str | None:
    # Launch a dedicated browser/context only if the caller didn't provide one
    should_close_browser = browser is None
    should_close_context = context is None

    if should_close_browser:
        p = await async_playwright().start()
        browser = await p.chromium.launch(headless=headless)

    if should_close_context:
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        )

    # Evict and close the oldest tracked page once the cap is reached
    if open_pages is not None and len(open_pages) >= max_open_pages:
        old_page = open_pages.popleft()
        await old_page.close()

    page = await context.new_page()

    if open_pages is not None:
        open_pages.append(page)

    try:
        response = await page.goto(url, timeout=timeout, wait_until="load")
        if not response or response.status >= 400:
            if logger:
                logger.error(f"Failed to fetch {url}")
            return None
        html = await page.content()
        return html
    except Exception as e:
        if logger:
            logger.warning(f"Error fetching {url}: {e}")
        return None
    finally:
        # Always close this page; tear down the context/browser only if we created them here
        await page.close()
        if open_pages is not None and page in open_pages:
            open_pages.remove(page)
        if should_close_context:
            await context.close()
        if should_close_browser:
            await browser.close()

Inside my crawler, the part that uses this function looks like this:


async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    context = await browser.new_context()

    total_pages = 0

    while self.queue:
        batch = []
        for _ in range(self.parallel_requests):
            if not self.queue:
                break

            url, depth = self.queue.popleft()

            if url in self.visited_urls or depth > self.max_depth:
                continue

            should_skip_url_, reason = should_skip_url(url=url)

            if should_skip_url_:
                self.logger.info(f"Skipping {url}: {reason}")
                continue

            total_pages += 1
            self.logger.info(f"[{total_pages}] Crawling: {url} (Depth: {depth})")

            self.visited_urls.add(url)
            batch.append((url, depth))

        self.logger.info(f"open_pages size before fetching batch: {len(self.open_pages)}")
        tasks = [
            fetch_page_content(
                url=url,
                context=context,
                open_pages=self.open_pages,
                max_open_pages=self.max_open_pages,
                logger=self.logger,
            )
            for url, depth in batch
        ]

        html_results = await asyncio.gather(*tasks, return_exceptions=True)
        self.logger.info(f"open_pages size after fetching batch: {len(self.open_pages)}")

        for (url, depth), html_result in zip(batch, html_results):
            processing_successful = await self.process_and_save_fetched_html(url=url, html=html_result)
            if not processing_successful:
                continue

            links = await self.extract_links(html=html_result, base_url=url)
            await self.validate_and_enqueue_links(url=url, links=links, depth=depth)

        # Periodically recycle the browser and context to release accumulated resources
        if total_pages % self.restart_interval == 0 and total_pages != 0:
            self.logger.info("Restarting browser and context...")
            await context.close()
            await browser.close()
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()

Here's what I've tried so far:

  1. Limiting concurrency by using lower values of parallel_requests (see the sketch after this list).
  2. Manually closing pages using the open_pages deque.
  3. Restarting the browser and context at a fixed interval.
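
To illustrate what I mean in point 1, here is a minimal, simplified sketch (not my actual crawler) of how I'm thinking about capping concurrency: an asyncio.Semaphore bounds how many pages exist at once, and each page is closed in a finally block. The urls list and the max_open_pages value are placeholders.

import asyncio

from playwright.async_api import async_playwright


async def fetch_with_limit(context, semaphore, url: str, timeout: int = 60000) -> str | None:
    # The semaphore guarantees at most max_open_pages pages exist at any moment,
    # so the descriptors held by each page are released before new ones are opened.
    async with semaphore:
        page = await context.new_page()
        try:
            response = await page.goto(url, timeout=timeout, wait_until="load")
            if not response or response.status >= 400:
                return None
            return await page.content()
        except Exception:
            return None
        finally:
            await page.close()


async def main(urls: list[str], max_open_pages: int = 20) -> list[str | None]:
    semaphore = asyncio.Semaphore(max_open_pages)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        try:
            return await asyncio.gather(
                *(fetch_with_limit(context, semaphore, url) for url in urls)
            )
        finally:
            await context.close()
            await browser.close()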

Despite all of this, Playwright still throws an Errno 24 ("Too many open files") error and fails to fetch pages.

I'm at a bit of a dead end now and am not sure what else I can do. If possible, I'd rather handle this at the code level than manually raise the system's limit on open files.
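
In case it helps with diagnosis, something like the helper below (just a sketch; the name log_fd_usage is mine) could confirm whether descriptors really are leaking by comparing the live count against the process limit. It relies on the Unix-only resource module and on /proc, so it's Linux-specific.

import os
import resource


def log_fd_usage(logger) -> None:
    # Current soft/hard limits on open file descriptors for this process
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    # On Linux, /proc/self/fd lists every descriptor the process currently holds
    try:
        open_fds = len(os.listdir("/proc/self/fd"))
    except FileNotFoundError:
        open_fds = -1  # /proc is not available (e.g. macOS)
    logger.info(f"Open fds: {open_fds} (soft limit: {soft}, hard limit: {hard})")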
