I'm writing a crawler, using node.js, which will get data from an e-commerce website. Each of my inputs to fetch contains:
url: URL of that link
directory: Directory name into which the output file should be written later
page: Parameter to query
Each page fetch returns a number of items, each of which will be fetched in detail later.
This is my fetchPage promise (agent is require('superagent')) that will fetch the HTML text:
function fetchPage(url, page) {
    return new Promise((resolve, reject) => {
        if (page > 0) {
            agent
                .get(url)
                .send('page=' + page)
                .end(function (err, res) {
                    if (err) {
                        reject(err);
                    } else {
                        resolve(res.text);
                    }
                });
        } else {
            agent
                .get(url)
                .end(function (err, res) {
                    if (err) {
                        reject(err);
                    } else {
                        resolve(res.text);
                    }
                });
        }
    });
}
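Side note: the two branches above differ only in the page parameter. A deduplicated sketch, assuming superagent's .query() is acceptable for this endpoint instead of .send():

function fetchPage(url, page) {
    return new Promise((resolve, reject) => {
        const request = agent.get(url);
        // Only attach the query parameter when a page was requested
        if (page > 0) {
            request.query({ page: page });
        }
        request.end(function (err, res) {
            if (err) {
                reject(err);
            } else {
                resolve(res.text);
            }
        });
    });
}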
Global calls:
var data = [];
for (var i = 1; i <= links[0].numOfPages; i++) {
    data.push({
        url: links[0].url,
        directory: links[0].directory,
        page: i
    });
}
const promises = data.reduce(
    (promise, data) => promise.then(() => {
        // Note: the inner promise is not returned here, so the chain
        // does not actually wait for each page to finish
        fetchPage(data.url, data.page).then(
            (result) => {
                const urls = getUrls(result);
                Promise.all(urls.map((url, i) => fetchPage(url, 0).then(
                    (result) => {
                        var item = getItem(result);
                        item.url = url;
                        writeItem(item, data.directory, data.page, i + 1);
                    },
                    (error) => console.log(error)
                )));
            });
    }),
    Promise.resolve());
promises.then((values) => console.log('All done'));
There are 3 utility functions you will see (all of them work properly):
getUrls: Processes the HTML text of a page, returning an array of URLs of items to crawl in detail later
getItem: Processes the HTML text of an item's detail page, returning an object that will be written to a file
writeItem: Writes an object to a file, given the directory and page number needed to create the proper directory and store the output
Here is the problem I have been encountering:
- How can I rebuild this using a queue of promises in which each promise runs one-by-one, one after another, in order, while only allowing a limited number of promises to run concurrently?
How do I do this properly and efficiently? How should I change my current code? I also need a demo.
I deleted the fetchItem function because it was unnecessary (it just called fetchPage with page = 0); now I only use fetchPage.
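For reference, the deleted helper was just a thin wrapper, presumably along these lines:

// Hypothetical reconstruction of the deleted fetchItem helper
const fetchItem = (url) => fetchPage(url, 0);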
You have a classic case of an asynchronous action inside of a for loop; see "JavaScript closure inside loops – simple practical example". Use let page instead of var page as a very simple solution. – Madara's Ghost Commented Apr 17, 2016 at 8:26
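To illustrate the comment's point, a minimal sketch of the var-vs-let pitfall (a hypothetical loop, not the code above):

// With var, every callback closes over the same binding and
// sees its final value after the loop has finished
for (var i = 1; i <= 3; i++) {
    fetchPage(url, i).then(() => console.log('finished page', i)); // logs 4, 4, 4
}
// With let, each iteration gets its own binding
for (let i = 1; i <= 3; i++) {
    fetchPage(url, i).then(() => console.log('finished page', i)); // logs 1, 2, 3
}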
2 Answers
For your case, I suggest that you install the Bluebird Promise library, because it provides a couple of utilities that you can use.
For your question: normally, you don't use for loops in conjunction with Promises. Instead, you construct an array of data and a mapping function that returns a Promise, then either .map() + Promise.all() or .reduce() the array into a single Promise that resolves when everything has completed.
Bluebird's Promise.map() also allows you to specify a concurrency option that will limit how many actions can run simultaneously.
Here are a few examples to get you started:
Running async actions concurrently
const Promise = require('bluebird');
const urls = ['https://url1.com', 'https://url2.com', ... ]; // lots of urls
// {concurrency: 4} means only 4 URLs are processed at any given time.
const allPromise = Promise.map(urls, fetchUrlAsync, {concurrency: 4});
allPromise.then(allValues => {
    // Deal with all results in order of original array
});
Running async actions sequentially:
const Promise = require('bluebird');
const urls = ['https://url1.com', 'https://url2.com', ... ]; // lots of urls
// Start with an empty promise and chain all calls on top of it,
// so only one URL is processed at any given time.
const allPromise = urls.reduce((promise, url) =>
    promise.then(() => fetchUrlAsync(url)), Promise.resolve());
allPromise.then(() => {
    // All fetches have completed, one after another
});
Try to think of things as collections of values and the actions you perform on those values. Abstract your actions into functions and call them when appropriate, and don't mix fetching with writing in the same place, as in the sketch below.
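Applied to the crawler in the question, that separation might look like this sketch (assuming Bluebird plus the fetchPage, getUrls, getItem, and writeItem functions from the question, with data being the array built in the question's global calls):

const Promise = require('bluebird');

// Fetch one listing page, then fetch each of its item pages with
// at most 4 item requests in flight at any time
function crawlListingPage(entry) {
    return fetchPage(entry.url, entry.page)
        .then(getUrls)
        .then(urls => Promise.map(urls, (url, i) =>
            fetchPage(url, 0)
                .then(result => {
                    const item = getItem(result);
                    item.url = url;
                    writeItem(item, entry.directory, entry.page, i + 1);
                })
                .catch(error => console.log(error)),
            { concurrency: 4 }));
}

// Process the listing pages one after another
Promise.each(data, crawlListingPage)
    .then(() => console.log('All done'));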
First of all, if you really want to control your execution, you should not construct a loop that calls a promise-returning function, because it will be executed immediately. Instead, you should construct some data to be supplied to the promise. And sorry, I don't really understand your program flow. I can see that you're calling fetchPage, and after it completes, it calls fetchItem, which calls fetchPage again. That's maybe why you're getting a double callback.
For your second question, here's an example of how you could process each link serially, and process the pages of a link in parallel with a maximum of 3 concurrent jobs.
var Promise = require('bluebird');
var chance = new (require('chance'))();

var fetchPage = (url, page) => new Promise((resolve, reject) => {
    // Simulate Network Operation
    if (page === 0) {
        console.log('Start Downloading: ' + url);
        setTimeout(() => {
            resolve({
                url: url,
                content: 'Content of ' + url
            });
        }, chance.integer({ min: 10, max: 250 }));
    } else {
        console.log('Start Downloading: ' + url + '?page=' + page);
        setTimeout(() => {
            resolve({
                url: url + '?page=' + page,
                content: 'Content of ' + url + '?page=' + page
            });
        }, chance.integer({ min: 10, max: 250 }));
    }
});

var fetchItem = link => {
    // Get the data to be supplied to the fetchPage promise
    var data = [];
    for (var i = 0; i <= link.numOfPages; i++) {
        data.push({
            url: link.url,
            page: i
        });
    }
    return data;
};

var writeItem = (item, directory) => {
    // Simulate Writing to Directory
    console.log('Writing ' + item + ' to ' + directory + ' folder');
};

// Make some dummy links
var links = [];
for (var i = 0; i < 10; i++) {
    var domain = chance.domain();
    links.push({
        url: chance.url({ domain: domain }),
        directory: domain,
        numOfPages: chance.integer({ min: 0, max: 5 })
    });
}

// Process each URL serially
Promise.each(links, link => Promise.map(fetchItem(link), data => fetchPage(data.url, data.page).then(result => {
    writeItem(result.content, link.directory);
    console.log('Done Fetching: ' + result.url);
}), {
    // Control the number of concurrent jobs
    concurrency: 3
})).then(() => {
    console.log('All Done!!');
});
UPDATE: A simpler example to demonstrate Promise.each and Promise.map:
var Promise = require('bluebird');
var chance = new (require('chance'))();

var tasks = [];
for (var i = 1; i <= chance.integer({ min: 10, max: 20 }); i++) {
    var jobs = [];
    for (var j = 1; j <= chance.integer({ min: 2, max: 10 }); j++) {
        jobs.push({
            job_name: 'Job ' + j
        });
    }
    tasks.push({
        task_name: 'Task ' + i,
        jobs: jobs
    });
}

Promise.each(tasks, task => Promise.map(task.jobs, job => new Promise((resolve, reject) => {
    setTimeout(() => resolve(task.task_name + ' ' + job.job_name), chance.integer({ min: 20, max: 150 }));
}).then(log => console.log(log)), {
    concurrency: 3
}).then(() => console.log())).then(() => {
    console.log('All Done!!');
});
In this example you can clearly see that each task runs sequentially, while the jobs within a task run in parallel with a maximum of 3 concurrent jobs at a time.
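If you'd rather not pull in Bluebird, a similar limit can be sketched with plain Promises (a hypothetical helper, not part of the answer above):

// Run mapper over items with at most `limit` promises in flight,
// resolving with the results in the original order
function mapWithConcurrency(items, mapper, limit) {
    const results = new Array(items.length);
    let next = 0;
    // Each worker repeatedly claims the next unprocessed index
    function worker() {
        if (next >= items.length) return Promise.resolve();
        const i = next++;
        return Promise.resolve(mapper(items[i], i))
            .then(result => { results[i] = result; })
            .then(worker);
    }
    const workers = [];
    for (let k = 0; k < Math.min(limit, items.length); k++) {
        workers.push(worker());
    }
    return Promise.all(workers).then(() => results);
}

// Usage, e.g.: mapWithConcurrency(urls, url => fetchPage(url, 0), 3).then(pages => console.log(pages.length));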