javascript - node.js - Control a queue of Promises - Stack Overflow

I'm writing a crawler, which will get data from an e-commerce website, using node.js. Each of my inputs

I'm writing a crawler, which will get data from an e-commerce website, using node.js. Each of my inputs to fetch contains:

  • url: URL of that link
  • directory: Directory name into which the output file should be written later
  • page: Parameter to query

Each page will fetch a number of items, each of them will be fetched in details later

This is my fetchPage promise (agent is require('superagent')) that will fetch HTML text:

/**
 * Fetch the HTML text of a page with superagent.
 *
 * @param {string} url  - URL to request.
 * @param {number} page - 1-based page number to send as a query parameter;
 *                        0 (or less) requests the bare URL with no parameter.
 * @returns {Promise<string>} resolves with the response body text,
 *                            rejects with the transport error.
 */
function fetchPage(url, page) {
    return new Promise((resolve, reject) => {
        // The two original branches were identical except for the payload,
        // so build the request once and attach the page conditionally.
        let request = agent.get(url);
        if (page > 0) {
            request = request.send('page=' + page);
        }
        request.end(function (err, res) {
            if (err) {
                reject(err);
            } else {
                resolve(res.text);
            }
        });
    });
}

Global calls:

// Build one fetch descriptor per page of the first link (pages start at 1).
var data = [];
var firstLink = links[0];
for (var i = 1; i <= firstLink.numOfPages; i++) {
    data.push({
        url: firstLink.url,
        directory: firstLink.directory,
        page: i
    });
}

// Chain the page fetches one after another onto a single promise.
// BUG FIX over the original: each callback must RETURN its promise —
// without the returns the chain resolves immediately, all pages fetch
// concurrently, and 'All done' logs before any work has finished.
const promises = data.reduce(
    (promise, data) => promise.then(() =>
        fetchPage(data.url, data.page).then((result) => {
            const urls = getUrls(result);
            // Detail pages of one listing page run in parallel; returning
            // Promise.all makes the chain wait for every one of them.
            return Promise.all(urls.map((url, i) =>
                fetchPage(url, 0).then(
                    (result) => {
                        var item = getItem(result);
                        item.url = url;
                        writeItem(item, data.directory, data.page, i + 1);
                    },
                    (error) => console.log(error)
                )
            ));
        })
    ),
    Promise.resolve());

promises.then((values) => console.log('All done'));

There are 3 functions you will see as utilities (all of them work properly):

  • getUrls: Process HTML text of a page, returning an array of urls of items to crawl in details later
  • getItem: Process HTML text of an item's detailed page, returning an object that will be written into file
  • writeItem: Write an object to file, provided with directory and page number to make proper directory and write and store

There is a problem I have been encountering:

  • How can I rebuild it using a queue of promises in which each promise will run one-by-one and one-after-another orderly and synchronously and only allows a limited number of promises running concurrently?

How to do it properly and efficiently? How should I change with these current code? I need some demo also

I deleted the fetchItem function because it was unnecessary (it simply called fetchPage with page = 0); now I only use fetchPage

I'm writing a crawler, which will get data from an e-commerce website, using node.js. Each of my inputs to fetch contains:

  • url: URL of that link
  • directory: Directory name into which the output file should be written later
  • page: Parameter to query

Each page will fetch a number of items, each of them will be fetched in details later

This is my fetchPage promise (agent is require('superagent')) that will fetch HTML text:

/**
 * Fetch the HTML text of a page with superagent.
 *
 * @param {string} url  - URL to request.
 * @param {number} page - 1-based page number to send as a query parameter;
 *                        0 (or less) requests the bare URL with no parameter.
 * @returns {Promise<string>} resolves with the response body text,
 *                            rejects with the transport error.
 */
function fetchPage(url, page) {
    return new Promise((resolve, reject) => {
        // The two original branches were identical except for the payload,
        // so build the request once and attach the page conditionally.
        let request = agent.get(url);
        if (page > 0) {
            request = request.send('page=' + page);
        }
        request.end(function (err, res) {
            if (err) {
                reject(err);
            } else {
                resolve(res.text);
            }
        });
    });
}

Global calls:

// Build one fetch descriptor per page of the first link (pages start at 1).
var data = [];
var firstLink = links[0];
for (var i = 1; i <= firstLink.numOfPages; i++) {
    data.push({
        url: firstLink.url,
        directory: firstLink.directory,
        page: i
    });
}

// Chain the page fetches one after another onto a single promise.
// BUG FIX over the original: each callback must RETURN its promise —
// without the returns the chain resolves immediately, all pages fetch
// concurrently, and 'All done' logs before any work has finished.
const promises = data.reduce(
    (promise, data) => promise.then(() =>
        fetchPage(data.url, data.page).then((result) => {
            const urls = getUrls(result);
            // Detail pages of one listing page run in parallel; returning
            // Promise.all makes the chain wait for every one of them.
            return Promise.all(urls.map((url, i) =>
                fetchPage(url, 0).then(
                    (result) => {
                        var item = getItem(result);
                        item.url = url;
                        writeItem(item, data.directory, data.page, i + 1);
                    },
                    (error) => console.log(error)
                )
            ));
        })
    ),
    Promise.resolve());

promises.then((values) => console.log('All done'));

There are 3 functions you will see as utilities (all of them work properly):

  • getUrls: Process HTML text of a page, returning an array of urls of items to crawl in details later
  • getItem: Process HTML text of an item's detailed page, returning an object that will be written into file
  • writeItem: Write an object to file, provided with directory and page number to make proper directory and write and store

There is a problem I have been encountering:

  • How can I rebuild it using a queue of promises in which each promise will run one-by-one and one-after-another orderly and synchronously and only allows a limited number of promises running concurrently?

How to do it properly and efficiently? How should I change with these current code? I need some demo also

I deleted the fetchItem function because it was unnecessary (it simply called fetchPage with page = 0); now I only use fetchPage

Share Improve this question edited Apr 18, 2016 at 14:26 necroface asked Apr 17, 2016 at 4:08 necrofacenecroface 3,46513 gold badges48 silver badges71 bronze badges 1
  • 1 You have a classic case of an asynchronous action inside of a for loop, see JavaScript closure inside loops – simple practical example. Use let page instead of var page as a very simple solution. – Madara's Ghost Commented Apr 17, 2016 at 8:26
Add a comment  | 

2 Answers 2

Reset to default 6

For your case, I suggest that you install the Bluebird Promise library, because it provides a couple of utilities that you can use.

For your questions: normally, you don't use for loops in conjunction with Promises. You construct an array of data and a mapping function that returns a Promise, then either .map() + Promise.all() or .reduce() the array into a single Promise that resolves when everything has completed.

Bluebird's Promise.map() also allows you to specify a concurrency option, that will limit how many actions can run simultaneously.


Here are a few examples to get you started:

Running async actions concurrently

const Promise = require('bluebird');
// NOTE: the example domains were truncated by scraping ("https://url1.");
// restored here, and the bare "..." placeholder (invalid JS) replaced with
// a comment so the snippet actually parses.
const urls = ['https://url1.com', 'https://url2.com' /* , more urls */];
// {concurrency: 4} means only 4 URLs are processed at any given time.
const allPromise = Promise.map(urls, fetchUrlAsync, {concurrency: 4});
allPromise.then(allValues => {
  // Deal with all results in order of original array
});

Running async actions sequentially:

const Promise = require('bluebird');
const urls = ['https://url1.com', 'https://url2.com' /* , more urls */];
// Sequential: each fetch starts only after the previous one has resolved.
// (The "{concurrency: 4}" comment here was copy-pasted from the parallel
// example above and did not apply to this reduce-based chain.)
const allPromise = urls.reduce((promise, url) =>
  // Start with an empty promise, chain all calls on top of that
  promise.then(() => fetchUrlAsync(url)), Promise.resolve());
allPromise.then(allValues => {
  // Deal with all results in order of original array
});

Try to think of things as collections of values, and the actions you perform on those values, abstract your actions into functions, and call them when appropriate, don't mix fetching with writing in the same place.

First of all, if you want to really control your execution, then you should not construct a loop to call a promise. It will be executed immediately. Instead, you should construct some data to be supplied to the promise. And sorry, I don't really understand your program flow. I can see that you're calling fetchPage and after it completes, calling fetchItem, which calls fetchPage again. That's maybe why you're getting a double callback.

For your second question, here's an example on how you could process each link serially, and process pages in a link in parallel with maximum 3 concurrent jobs.

var Promise = require('bluebird');
var chance = new (require('chance'))();

// Simulated network fetch: resolves after a random 10-250 ms delay with a
// dummy payload. page === 0 requests the bare URL; otherwise ?page=N is
// appended. The two original branches differed only in that URL suffix,
// so compute it once instead of duplicating the whole body.
var fetchPage = (url, page) => new Promise((resolve, reject) => {
    var fullUrl = page === 0 ? url : url + '?page=' + page;
    // Simulate Network Operation
    console.log('Start Downloading: ' + fullUrl);
    setTimeout(() => {
        resolve({
            url: fullUrl,
            content: 'Content of ' + fullUrl
        });
    }, chance.integer({ min: 10, max: 250 }));
});

// Expand a link into the inputs for fetchPage: one entry per page, from
// page 0 (the bare URL) up to and including link.numOfPages.
var fetchItem = link => {
    var jobs = [];
    var page = 0;
    while (page <= link.numOfPages) {
        jobs.push({ url: link.url, page: page });
        page += 1;
    }
    return jobs;
};

// Simulated write: log what would be stored and into which folder.
var writeItem = (item, directory) => {
    console.log(`Writing ${item} to ${directory} folder`);
};

// Fabricate ten fake links to crawl, each with 0-5 pages and a directory
// named after its domain.
var links = [];
for (var n = 0; n < 10; n++) {
    var domain = chance.domain();
    var link = {
        url: chance.url({ domain: domain }),
        directory: domain,
        numOfPages: chance.integer({ min: 0, max: 5 })
    };
    links.push(link);
}

// Crawl one link: fetch its pages with at most three requests in flight,
// writing each result as it lands.
var processLink = link =>
    Promise.map(
        fetchItem(link),
        data => fetchPage(data.url, data.page).then(result => {
            writeItem(result.content, link.directory);
            console.log('Done Fetching: ' + result.url);
        }),
        // Control the number of concurrent jobs
        { concurrency: 3 }
    );

// Process each URL serially
Promise.each(links, processLink).then(() => {
    console.log('All Done!!');
});

UPDATE: A simpler example to demonstrate Promise.each and Promise.map

var Promise = require('bluebird');
var chance = new (require('chance'))();

var tasks = [];

// BUG FIX: the original loops evaluated chance.integer() inside the loop
// CONDITION, drawing a fresh random upper bound on every iteration and
// skewing the task/job counts. Draw each bound exactly once, up front.
var taskCount = chance.integer({ min: 10, max: 20 });
for (var i = 1; i <= taskCount; i++) {
    var jobs = [];
    var jobCount = chance.integer({ min: 2, max: 10 });
    for (var j = 1; j <= jobCount; j++) {
        jobs.push({
            job_name: 'Job ' + j
        });
    }

    tasks.push({
        task_name: 'Task ' + i,
        jobs: jobs
    });
}

// Tasks run one at a time (Promise.each); within a task, its jobs run in
// parallel but never more than 3 at once (Promise.map with concurrency).
// Each simulated job resolves after a random 20-150 ms delay, logging
// "<task> <job>"; the empty console.log prints a blank line between tasks.
Promise.each(tasks, task => Promise.map(task.jobs, job => new Promise((resolve, reject) => {
    setTimeout(() => resolve(task.task_name + ' ' + job.job_name), chance.integer({ min: 20, max: 150 }));
}).then(log => console.log(log)), {
    concurrency: 3
}).then(() => console.log())).then(() => {
    console.log('All Done!!');
});

In this example you can clearly see that each task is run sequentially and each job in a task is run in parallel with a maximum of 3 concurrent jobs at a time.

发布者:admin,转转请注明出处:http://www.yc00.com/questions/1743676355a4488630.html

相关推荐

  • javascript - node.js - Control a queue of Promises - Stack Overflow

    I'm writing a crawler, which will get data from an e-commerce website, using node.js. Each of my inputs

    22小时前
    10

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信