javascript - Puppeteer timeout on specific website running on server cloud headless - Stack Overflow

I've made a Node.js web scraper that runs fine on my computer; however, when I deploy it to my Goog…

I've made a Node.js web scraper that runs fine on my computer; however, when I deploy it to my Google Cloud VM instance running Debian, it returns a timeout error for a specific website. I've tried many different setups for puppeteer, but none seems to work. I believe the website I'm trying to scrape is blocking my code when I run it from the Google Cloud server, but not when I run it from my computer. The scraping part works fine on my computer: Puppeteer finds the HTML tags and retrieves the info.

const puppeteer = require('puppeteer');
const GoogleSpreadsheet = require('google-spreadsheet');
const { promisify } = require('util');
const credentials = require('./credentials.json');

/**
 * Entry point: waits for the COPASA scrape to finish, then reports completion.
 * The scraped values are not used here.
 *
 * @returns {Promise<void>}
 */
async function main() {
    await scrapCopasaFunction();
    console.log('Done!');
}



/**
 * Scrapes three reservoir levels from the COPASA reservoir-level page.
 *
 * Launches a browser through puppeteer, opens the page, reads the text of
 * every `table tr td` cell and picks the three cells holding the levels.
 * The browser is closed on every exit path, including failures.
 *
 * @param {object} [options]
 * @param {string} [options.url] - Page to open; defaults to the COPASA
 *   reservoir-level page.
 * @param {number} [options.timeout=30000] - Navigation timeout in
 *   milliseconds handed to `page.goto` (0 disables the timeout).
 * @returns {Promise<string[]|undefined>} `[riomanso, serraazul,
 *   vargemdasflores]` as strings with a decimal point, or `undefined` when
 *   the site answered with its "page isn't available" notice.
 * @throws {Error} When an expected table cell is missing. Navigation and
 *   browser errors (e.g. puppeteer's TimeoutError) propagate unchanged.
 */
async function scrapCopasaFunction(options = {}) {
    const url = options.url !== undefined
        ? options.url
        : 'http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios';
    const timeout = options.timeout !== undefined ? options.timeout : 30000;
    // Positions, within the flattened list of <td> texts, of the Rio Manso,
    // Serra Azul and Vargem das Flores cells (in that order).
    const levelCellIndexes = [13, 17, 21];

    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
        await page.goto(url, { timeout });

        // Runs inside the page. Both the existence check and the text read
        // use the first <h2> element (the old code probed a CSS class named
        // "h2" first, so the notice was never detected).
        const isPageUnavailable = await page.evaluate(() => {
            const heading = document.getElementsByTagName('h2')[0];
            return Boolean(heading) && heading.textContent === "Sorry, this page isn't available.";
        });

        if (isPageUnavailable) {
            console.log('Account not exists!');
            return;
        }

        const cellTexts = await page.evaluate(() =>
            Array.from(document.querySelectorAll('table tr td'), (td) => td.innerText)
        );

        return levelCellIndexes.map((index) => {
            const text = cellTexts[index];
            if (typeof text !== 'string') {
                throw new Error(`Expected a reservoir level in table cell ${index}, but the page has only ${cellTexts.length} cells`);
            }
            // Decimal comma -> decimal point, then keep the first five characters.
            return text.replace(',', '.').substring(0, 5);
        });
    } finally {
        // Always release the browser process, even when navigation times out.
        await browser.close();
    }
}



// Start the scraper. A rejected promise is reported and turned into a
// non-zero exit code instead of an unhandled promise rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});

And the error that I'm getting is the following:

(node:6425) UnhandledPromiseRejectionWarning: TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
    at /home/xxx/reservoirs/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
    at async FrameManager.navigateFrame (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:94:17)
    at async Frame.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:406:12)
    at async Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:12)
    at async scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:5)
    at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
  -- ASYNC --
    at Frame.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:111:15)
    at Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:49)
    at Page.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:112:23)
    at scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:16)
    at processTicksAndRejections (internal/process/task_queues.js:93:5)
    at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6425) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async f
unction without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled
 promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode)
. (rejection id: 1)
(node:6425) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not ha
ndled will terminate the Node.js process with a non-zero exit code.

I've made a Node.js web scraper that runs fine on my computer; however, when I deploy it to my Google Cloud VM instance running Debian, it returns a timeout error for a specific website. I've tried many different setups for puppeteer, but none seems to work. I believe the website I'm trying to scrape is blocking my code when I run it from the Google Cloud server, but not when I run it from my computer. The scraping part works fine on my computer: Puppeteer finds the HTML tags and retrieves the info.

const puppeteer = require('puppeteer');
const GoogleSpreadsheet = require('google-spreadsheet');
const { promisify } = require('util');
const credentials = require('./credentials.json');

/**
 * Entry point: waits for the COPASA scrape to finish, then reports completion.
 * The scraped values are not used here.
 *
 * @returns {Promise<void>}
 */
async function main() {
    await scrapCopasaFunction();
    console.log('Done!');
}



/**
 * Scrapes three reservoir levels from the COPASA reservoir-level page.
 *
 * Launches a browser through puppeteer, opens the page, reads the text of
 * every `table tr td` cell and picks the three cells holding the levels.
 * The browser is closed on every exit path, including failures.
 *
 * @param {object} [options]
 * @param {string} [options.url] - Page to open; defaults to the COPASA
 *   reservoir-level page.
 * @param {number} [options.timeout=30000] - Navigation timeout in
 *   milliseconds handed to `page.goto` (0 disables the timeout).
 * @returns {Promise<string[]|undefined>} `[riomanso, serraazul,
 *   vargemdasflores]` as strings with a decimal point, or `undefined` when
 *   the site answered with its "page isn't available" notice.
 * @throws {Error} When an expected table cell is missing. Navigation and
 *   browser errors (e.g. puppeteer's TimeoutError) propagate unchanged.
 */
async function scrapCopasaFunction(options = {}) {
    // The host is "copasa.com.br"; "copasa..br" is not a valid host name.
    const url = options.url !== undefined
        ? options.url
        : 'http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios';
    const timeout = options.timeout !== undefined ? options.timeout : 30000;
    // Positions, within the flattened list of <td> texts, of the Rio Manso,
    // Serra Azul and Vargem das Flores cells (in that order).
    const levelCellIndexes = [13, 17, 21];

    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
    });

    try {
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
        await page.goto(url, { timeout });

        // Runs inside the page. Both the existence check and the text read
        // use the first <h2> element (the old code probed a CSS class named
        // "h2" first, so the notice was never detected).
        const isPageUnavailable = await page.evaluate(() => {
            const heading = document.getElementsByTagName('h2')[0];
            return Boolean(heading) && heading.textContent === "Sorry, this page isn't available.";
        });

        if (isPageUnavailable) {
            console.log('Account not exists!');
            return;
        }

        const cellTexts = await page.evaluate(() =>
            Array.from(document.querySelectorAll('table tr td'), (td) => td.innerText)
        );

        return levelCellIndexes.map((index) => {
            const text = cellTexts[index];
            if (typeof text !== 'string') {
                throw new Error(`Expected a reservoir level in table cell ${index}, but the page has only ${cellTexts.length} cells`);
            }
            // Decimal comma -> decimal point, then keep the first five characters.
            return text.replace(',', '.').substring(0, 5);
        });
    } finally {
        // Always release the browser process, even when navigation times out.
        await browser.close();
    }
}



// Start the scraper. A rejected promise is reported and turned into a
// non-zero exit code instead of an unhandled promise rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});

And the error that I'm getting is the following:

(node:6425) UnhandledPromiseRejectionWarning: TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
    at /home/xxx/reservoirs/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
    at async FrameManager.navigateFrame (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:94:17)
    at async Frame.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:406:12)
    at async Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:12)
    at async scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:5)
    at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
  -- ASYNC --
    at Frame.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:111:15)
    at Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:49)
    at Page.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:112:23)
    at scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:16)
    at processTicksAndRejections (internal/process/task_queues.js:93:5)
    at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6425) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async f
unction without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled
 promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode)
. (rejection id: 1)
(node:6425) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not ha
ndled will terminate the Node.js process with a non-zero exit code.
Share Improve this question edited Jan 28, 2021 at 19:58 DisappointedByUnaccountableMod 6,8464 gold badges20 silver badges23 bronze badges asked Jan 10, 2021 at 20:08 grcgrc 3241 gold badge6 silver badges23 bronze badges
Add a comment  | 

2 Answers 2

Reset to default 5

Cloud functions are a bit slow for puppeteer; there was a GitHub issue (#3120) about this. If possible, assign more CPU/RAM to the function: the more CPU and RAM you give Chrome, the faster it will be.

You can pass a timeout option to goto: it is the maximum navigation time in milliseconds. It defaults to 30 seconds; pass 0 to disable the timeout.

// Allow up to 60 s (instead of the 30 s default) for the navigation to finish.
await page.goto('http://www.copasa.com.br', { timeout: 60000 });

You can also set the navigation timeout with setDefaultTimeout or setDefaultNavigationTimeout; the latter takes priority over setDefaultTimeout.

// Set this page's default navigation timeout to 60000 ms.
page.setDefaultNavigationTimeout(60000)

The data you're extracting is already in HTML, so you can fetch HTML with HTTP request and extract data in Node.js script instead of the browser. This will be faster and require fewer resources. If you need to authenticate, you can send a POST request and reuse the cookie in the following GET request. Example in this answer.

Full example

const cheerio = require('cheerio')
const got = require('got')

// Page listing the COPASA reservoir levels (host is copasa.com.br).
// NOTE: within this module the constant shadows the global `URL` class.
const URL = 'http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios'

/**
 * Prints the given error to stderr and terminates the process with exit
 * status 1.
 *
 * @param {*} error - Value to report, normally the rejection reason.
 */
function reportAndExit (error) {
  const failureExitCode = 1

  console.error(error)
  process.exit(failureExitCode)
}

/**
 * Fetches the reservoir page with a plain HTTP request and extracts the
 * levels from the returned HTML, without starting a browser.
 *
 * @returns {Promise<number[]>} Levels parsed with `parseFloat` after the
 *   decimal comma is replaced by a point, in table order. The list is also
 *   printed with `console.log`.
 */
async function main () {
  const headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
  }

  // got reads request headers from the `headers` option. Passing the header
  // map itself as the options object would not send the user-agent.
  const response = await got(URL, { headers })
  const $ = cheerio.load(response.body)

  // Fourth cell of each row, from the third row on, in the first table
  // inside #conteudo-principal.
  const levelCells = $('#conteudo-principal table:first-of-type tr:nth-of-type(n+3) td:nth-child(4)')
  const reservoirLevelsCopasa = levelCells
    .map((i, el) => parseFloat($(el).text().replace(',', '.')))
    .get()

  console.log(reservoirLevelsCopasa)

  return reservoirLevelsCopasa
}

// Run the scraper; any rejection is handed to reportAndExit.
main().catch((error) => reportAndExit(error))

Output

[ 83.4, 88.8, 85.9 ]

发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745338546a4623219.html

相关推荐

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信