Add a retry mechanism for data requests

zhl 2019-05-09 11:32:15 +08:00
parent 645aef2c70
commit d9f106d9ef


@@ -15,8 +15,38 @@ const maxIdx = 100000;
  * */
 const parseOnePage = async ({subLink, category, sortIdx}) => {
   const url = `${URL_BASE}${subLink}`
+  let html;
+  try {
+    html = await proxyUtil.getDataProxy(url)
+    await CrawlRecord.updateRecord({url: url,
+      className: 'dandanzan',
+      methodName: 'parseOnePage',
+      params: {subLink, category, sortIdx},
+      lastStatus: true,
+    })
+  } catch(err) {
+    console.log('parse page with network error, try again :', url);
+    try {
+      html = await proxyUtil.getDataProxy(url)
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseOnePage',
+        params: {subLink, category, sortIdx},
+        lastStatus: true,
+      })
+    } catch (err2) {
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseOnePage',
+        params: {subLink, category, sortIdx},
+        lastStatus: false,
+      })
+    }
+  }
+  if (!html) {
+    return false;
+  }
   try {
-    let html = await proxyUtil.getDataProxy(url)
     const $ = cheerio.load(html);
     if ($('.error404').text()) {
       console.log(`>>>>>>>>>>>> ${url} not found`);
@@ -60,22 +90,10 @@ const parseOnePage = async ({subLink, category, sortIdx}) => {
         record.sortIdx = sortIdx;
       }
       await record.save();
-      await CrawlRecord.updateRecord({url: url,
-        className: 'dandanzan',
-        methodName: 'parseOnePage',
-        params: {subLink, category, sortIdx},
-        lastStatus: true,
-      })
       console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
     }
   } catch (err) {
     console.log(err);
-    await CrawlRecord.updateRecord({url: url,
-      className: 'dandanzan',
-      methodName: 'parseOnePage',
-      params: {subLink, category, sortIdx},
-      lastStatus: false,
-    })
   }
 }
@@ -109,13 +127,23 @@ const parseListPage = async ({idx, category}) => {
       lastStatus: true,
     })
   } catch (err) {
-    console.log(err);
-    await CrawlRecord.updateRecord({url: url,
-      className: 'dandanzan',
-      methodName: 'parseListPage',
-      params: {idx, category},
-      lastStatus: false,
-    })
+    console.log('parse page with network error, try again :', url);
+    try {
+      html = await proxyUtil.getDataProxy(url)
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseListPage',
+        params: {idx, category},
+        lastStatus: true,
+      })
+    } catch (err2) {
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseListPage',
+        params: {idx, category},
+        lastStatus: false,
+      })
+    }
   }
   if (html) {
     const $ = cheerio.load(html);
@@ -142,17 +170,6 @@ const parseListPage = async ({idx, category}) => {
     } else {
       return 1;
     }
-    // if ($('.next-page')) {
-    //   let nextStr = $('.next-page a').attr('href');
-    //   console.log('has next page: ', nextStr);
-    //   try {
-    //     await parseListPage(nextStr, category);
-    //   } catch (err) {
-    //     console.log(err);
-    //   }
-    // } else {
-    //   console.log('########################### ALL LIST PAGE END ###########################');
-    // }
   } else {
     return 1;
   }
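
The retry logic added by this commit is the same in both parseOnePage and parseListPage: fetch once, on failure log and fetch a second time, and record lastStatus through CrawlRecord.updateRecord after the final attempt. A possible follow-up would be to factor the duplicated try/catch into one shared helper. The sketch below is only an illustration of that refactor, not part of this commit: fetchWithRetry and its parameters are hypothetical names, while proxyUtil.getDataProxy and CrawlRecord.updateRecord are assumed to behave exactly as they do in the diff above.

// Sketch of a shared retry helper (hypothetical, not in this commit).
// Makes one fetch plus `retries` extra attempts; updates the crawl record with
// lastStatus: true on the first success, or lastStatus: false after the last failure.
const fetchWithRetry = async ({url, className, methodName, params, retries = 1}) => {
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      const html = await proxyUtil.getDataProxy(url);
      await CrawlRecord.updateRecord({url, className, methodName, params, lastStatus: true});
      return html;
    } catch (err) {
      console.log(`network error on attempt ${attempt + 1}, url:`, url);
    }
  }
  await CrawlRecord.updateRecord({url, className, methodName, params, lastStatus: false});
  return null;
};

With such a helper, parseOnePage could open with a single call like
const html = await fetchWithRetry({url, className: 'dandanzan', methodName: 'parseOnePage', params: {subLink, category, sortIdx}})
followed by the existing if (!html) return false; guard, and parseListPage would lose its nested catch block the same way.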