Add a retry mechanism for data requests

zhl 2019-05-09 11:32:15 +08:00
parent 645aef2c70
commit d9f106d9ef


@@ -15,8 +15,38 @@ const maxIdx = 100000;
  * */
 const parseOnePage = async ({subLink, category, sortIdx}) => {
     const url = `${URL_BASE}${subLink}`
+    let html;
+    try {
+        html = await proxyUtil.getDataProxy(url)
+        await CrawlRecord.updateRecord({url: url,
+            className: 'dandanzan',
+            methodName: 'parseOnePage',
+            params: {subLink, category, sortIdx},
+            lastStatus: true,
+        })
+    } catch(err) {
+        console.log('parse page with network error, try again :', url);
+        try {
+            html = await proxyUtil.getDataProxy(url)
+            await CrawlRecord.updateRecord({url: url,
+                className: 'dandanzan',
+                methodName: 'parseOnePage',
+                params: {subLink, category, sortIdx},
+                lastStatus: true,
+            })
+        } catch (err2) {
+            await CrawlRecord.updateRecord({url: url,
+                className: 'dandanzan',
+                methodName: 'parseOnePage',
+                params: {subLink, category, sortIdx},
+                lastStatus: false,
+            })
+        }
+    }
+    if (!html) {
+        return false;
+    }
     try {
-        let html = await proxyUtil.getDataProxy(url)
         const $ = cheerio.load(html);
         if ($('.error404').text()) {
             console.log(`>>>>>>>>>>>> ${url} not found`);
@@ -60,22 +90,10 @@ const parseOnePage = async ({subLink, category, sortIdx}) => {
                 record.sortIdx = sortIdx;
             }
             await record.save();
-            await CrawlRecord.updateRecord({url: url,
-                className: 'dandanzan',
-                methodName: 'parseOnePage',
-                params: {subLink, category, sortIdx},
-                lastStatus: true,
-            })
             console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
         }
     } catch (err) {
         console.log(err);
-        await CrawlRecord.updateRecord({url: url,
-            className: 'dandanzan',
-            methodName: 'parseOnePage',
-            params: {subLink, category, sortIdx},
-            lastStatus: false,
-        })
     }
 }
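
The fetch, retry-once, record-status sequence added above for parseOnePage is repeated almost line for line for parseListPage in the hunk below. As a rough sketch (not part of this commit), the duplicated logic could live in one shared helper; fetchWithRetry is a hypothetical name, and the sketch assumes proxyUtil.getDataProxy and CrawlRecord.updateRecord behave exactly as they do in this diff, with the final outcome recorded once per call:

// Hypothetical helper, not in this commit: fetch a URL through the proxy,
// retry once on a network error, and log the final outcome to CrawlRecord.
// Resolves to the HTML string, or undefined if both attempts fail.
const fetchWithRetry = async (url, {className, methodName, params}) => {
    let html;
    for (let attempt = 0; attempt < 2 && !html; attempt++) {
        try {
            html = await proxyUtil.getDataProxy(url);
        } catch (err) {
            console.log('parse page with network error, try again :', url);
        }
    }
    await CrawlRecord.updateRecord({
        url, className, methodName, params,
        lastStatus: Boolean(html),
    });
    return html;
};
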
@@ -109,13 +127,23 @@ const parseListPage = async ({idx, category}) => {
             lastStatus: true,
         })
     } catch (err) {
-        console.log(err);
-        await CrawlRecord.updateRecord({url: url,
-            className: 'dandanzan',
-            methodName: 'parseListPage',
-            params: {idx, category},
-            lastStatus: false,
-        })
+        console.log('parse page with network error, try again :', url);
+        try {
+            html = await proxyUtil.getDataProxy(url)
+            await CrawlRecord.updateRecord({url: url,
+                className: 'dandanzan',
+                methodName: 'parseListPage',
+                params: {idx, category},
+                lastStatus: true,
+            })
+        } catch (err2) {
+            await CrawlRecord.updateRecord({url: url,
+                className: 'dandanzan',
+                methodName: 'parseListPage',
+                params: {idx, category},
+                lastStatus: false,
+            })
+        }
     }
     if (html) {
         const $ = cheerio.load(html);
@@ -142,17 +170,6 @@ const parseListPage = async ({idx, category}) => {
         } else {
             return 1;
         }
-        // if ($('.next-page')) {
-        //     let nextStr = $('.next-page a').attr('href');
-        //     console.log('has next page: ', nextStr);
-        //     try {
-        //         await parseListPage(nextStr, category);
-        //     } catch (err) {
-        //         console.log(err);
-        //     }
-        // } else {
-        //     console.log('########################### ALL LIST PAGE END ###########################');
-        // }
     } else {
         return 1;
     }
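
With such a helper, the two call sites touched by this commit would shrink to something like the following (a sketch under the same assumptions as above, with fetchWithRetry remaining a hypothetical name):

// In parseOnePage:
html = await fetchWithRetry(url, {
    className: 'dandanzan',
    methodName: 'parseOnePage',
    params: {subLink, category, sortIdx},
});
if (!html) {
    return false;
}

// In parseListPage:
html = await fetchWithRetry(url, {
    className: 'dandanzan',
    methodName: 'parseListPage',
    params: {idx, category},
});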