diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js
index 57ca508..811f699 100644
--- a/src/sites/dandanzan.js
+++ b/src/sites/dandanzan.js
@@ -15,8 +15,38 @@ const maxIdx = 100000;
  * */
 const parseOnePage = async ({subLink, category, sortIdx}) => {
   const url = `${URL_BASE}${subLink}`
+  let html;
+  try {
+    html = await proxyUtil.getDataProxy(url)
+    await CrawlRecord.updateRecord({url: url,
+      className: 'dandanzan',
+      methodName: 'parseOnePage',
+      params: {subLink, category, sortIdx},
+      lastStatus: true,
+    })
+  } catch(err) {
+    console.log('parse page with network error, try again :', url);
+    try {
+      html = await proxyUtil.getDataProxy(url)
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseOnePage',
+        params: {subLink, category, sortIdx},
+        lastStatus: true,
+      })
+    } catch (err2) {
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseOnePage',
+        params: {subLink, category, sortIdx},
+        lastStatus: false,
+      })
+    }
+  }
+  if (!html) {
+    return false;
+  }
   try {
-    let html = await proxyUtil.getDataProxy(url)
     const $ = cheerio.load(html);
     if ($('.error404').text()) {
       console.log(`>>>>>>>>>>>> ${url} not found`);
@@ -60,22 +90,10 @@ const parseOnePage = async ({subLink, category, sortIdx}) => {
         record.sortIdx = sortIdx;
       }
       await record.save();
-      await CrawlRecord.updateRecord({url: url,
-        className: 'dandanzan',
-        methodName: 'parseOnePage',
-        params: {subLink, category, sortIdx},
-        statusCode: true,
-      })
       console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
     }
   } catch (err) {
     console.log(err);
-    await CrawlRecord.updateRecord({url: url,
-      className: 'dandanzan',
-      methodName: 'parseOnePage',
-      params: {subLink, category, sortIdx},
-      statusCode: false,
-    })
   }
 }
 
@@ -106,16 +124,26 @@ const parseListPage = async ({idx, category}) => {
       className: 'dandanzan',
       methodName: 'parseListPage',
       params: {idx, category},
-      statusCode: true,
+      lastStatus: true,
     })
   } catch (err) {
-    console.log(err);
-    await CrawlRecord.updateRecord({url: url,
-      className: 'dandanzan',
-      methodName: 'parseListPage',
-      params: {idx, category},
-      statusCode: false,
-    })
+    console.log('parse page with network error, try again :', url);
+    try {
+      html = await proxyUtil.getDataProxy(url)
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseListPage',
+        params: {idx, category},
+        lastStatus: true,
+      })
+    } catch (err2) {
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseListPage',
+        params: {idx, category},
+        lastStatus: false,
+      })
+    }
   }
   if (html) {
     const $ = cheerio.load(html);
@@ -142,17 +170,6 @@ const parseListPage = async ({idx, category}) => {
     } else {
       return 1;
     }
-    // if ($('.next-page')) {
-    //   let nextStr = $('.next-page a').attr('href');
-    //   console.log('has next page: ', nextStr);
-    //   try {
-    //     await parseListPage(nextStr, category);
-    //   } catch (err) {
-    //     console.log(err);
-    //   }
-    // } else {
-    //   console.log('########################### ALL LIST PAGE END ###########################');
-    // }
   } else {
     return 1;
   }