From 645aef2c70ed07462132ac70da1f8ace3175dac3 Mon Sep 17 00:00:00 2001 From: zhl Date: Thu, 9 May 2019 11:25:13 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=8A=93=E5=8F=96?= =?UTF-8?q?=E8=AE=B0=E5=BD=95=E7=8A=B6=E6=80=81=E8=AE=BE=E7=BD=AE=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sites/dandanzan.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js index 57ca508..acd7ba3 100644 --- a/src/sites/dandanzan.js +++ b/src/sites/dandanzan.js @@ -64,7 +64,7 @@ const parseOnePage = async ({subLink, category, sortIdx}) => { className: 'dandanzan', methodName: 'parseOnePage', params: {subLink, category, sortIdx}, - statusCode: true, + lastStatus: true, }) console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`); } @@ -74,7 +74,7 @@ const parseOnePage = async ({subLink, category, sortIdx}) => { className: 'dandanzan', methodName: 'parseOnePage', params: {subLink, category, sortIdx}, - statusCode: false, + lastStatus: false, }) } } @@ -106,7 +106,7 @@ const parseListPage = async ({idx, category}) => { className: 'dandanzan', methodName: 'parseListPage', params: {idx, category}, - statusCode: true, + lastStatus: true, }) } catch (err) { console.log(err); @@ -114,7 +114,7 @@ const parseListPage = async ({idx, category}) => { className: 'dandanzan', methodName: 'parseListPage', params: {idx, category}, - statusCode: false, + lastStatus: false, }) } if (html) { From d9f106d9ef3ff099e0afffdbe11485e834f24f20 Mon Sep 17 00:00:00 2001 From: zhl Date: Thu, 9 May 2019 11:32:15 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=E7=9A=84=E9=87=8D=E8=AF=95=E6=9C=BA=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sites/dandanzan.js | 79 +++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js index acd7ba3..811f699 100644 --- a/src/sites/dandanzan.js +++ b/src/sites/dandanzan.js @@ -15,8 +15,38 @@ const maxIdx = 100000; * */ const parseOnePage = async ({subLink, category, sortIdx}) => { const url = `${URL_BASE}${subLink}` + let html; + try { + html = await proxyUtil.getDataProxy(url) + await CrawlRecord.updateRecord({url: url, + className: 'dandanzan', + methodName: 'parseOnePage', + params: {subLink, category, sortIdx}, + lastStatus: true, + }) + } catch(err) { + console.log('parse page with network error, try again :', url); + try { + html = await proxyUtil.getDataProxy(url) + await CrawlRecord.updateRecord({url: url, + className: 'dandanzan', + methodName: 'parseOnePage', + params: {subLink, category, sortIdx}, + lastStatus: true, + }) + } catch (err2) { + await CrawlRecord.updateRecord({url: url, + className: 'dandanzan', + methodName: 'parseOnePage', + params: {subLink, category, sortIdx}, + lastStatus: false, + }) + } + } + if (!html) { + return false; + } try { - let html = await proxyUtil.getDataProxy(url) const $ = cheerio.load(html); if ($('.error404').text()) { console.log(`>>>>>>>>>>>> ${url} not found`); @@ -60,22 +90,10 @@ const parseOnePage = async ({subLink, category, sortIdx}) => { record.sortIdx = sortIdx; } await record.save(); - await CrawlRecord.updateRecord({url: url, - className: 'dandanzan', - methodName: 'parseOnePage', - params: {subLink, category, sortIdx}, - lastStatus: true, - }) console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`); } } catch (err) { console.log(err); - await CrawlRecord.updateRecord({url: url, - className: 'dandanzan', - methodName: 'parseOnePage', - params: {subLink, category, sortIdx}, - lastStatus: false, - }) } } @@ -109,13 +127,23 @@ const parseListPage = async ({idx, category}) => { lastStatus: true, }) } catch (err) { - console.log(err); - await CrawlRecord.updateRecord({url: url, - className: 'dandanzan', - methodName: 'parseListPage', - params: {idx, category}, - lastStatus: false, - }) + console.log('parse page with network error, try again :', url); + try { + html = await proxyUtil.getDataProxy(url) + await CrawlRecord.updateRecord({url: url, + className: 'dandanzan', + methodName: 'parseListPage', + params: {idx, category}, + lastStatus: true, + }) + } catch (err2) { + await CrawlRecord.updateRecord({url: url, + className: 'dandanzan', + methodName: 'parseListPage', + params: {idx, category}, + lastStatus: false, + }) + } } if (html) { const $ = cheerio.load(html); @@ -142,17 +170,6 @@ const parseListPage = async ({idx, category}) => { } else { return 1; } - // if ($('.next-page')) { - // let nextStr = $('.next-page a').attr('href'); - // console.log('has next page: ', nextStr); - // try { - // await parseListPage(nextStr, category); - // } catch (err) { - // console.log(err); - // } - // } else { - // console.log('########################### ALL LIST PAGE END ###########################'); - // } } else { return 1; }