From bbaa44f8c13a238f0a06e96784ce284b6019b8b8 Mon Sep 17 00:00:00 2001 From: zhl Date: Wed, 8 May 2019 21:34:24 +0800 Subject: [PATCH] =?UTF-8?q?=E8=9B=8B=E8=9B=8B=E8=B5=9E=E7=94=B5=E5=BD=B1?= =?UTF-8?q?=E9=80=9A=E8=BF=87superagent=E8=AF=B7=E6=B1=82=E6=95=B0?= =?UTF-8?q?=E7=BB=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sites/dandanzan.js | 13 +++++++------ src/utils/net.util.js | 1 + src/utils/proxy.util.js | 29 +++++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js index c72a457..1180303 100644 --- a/src/sites/dandanzan.js +++ b/src/sites/dandanzan.js @@ -4,6 +4,7 @@ import stringUtil from '../utils/string.util'; import Movie from '../models/Movies'; import generalQueue from '../utils/general.queue'; import proxy from './proxy'; +import proxyUtil from '../utils/proxy.util'; const URL_BASE = 'https://www.dandanzan.com' const maxIdx = 100000; @@ -16,7 +17,7 @@ const maxIdx = 100000; const parseOnePage = async (subLink, category, sortIdx) => { const url = `${URL_BASE}${subLink}` try { - let html = await netUtil.getDataProxy(url, {}) + let html = await proxyUtil.getDataProxy(url) const $ = cheerio.load(html); if ($('.error404').text()) { console.log(`>>>>>>>>>>>> ${url} not found`); @@ -89,7 +90,7 @@ const parseListPage = async (idx, category) => { console.log(`begin parse category: ${category} page: ${subPage}`); let html; try { - html = await netUtil.getDataProxy(url, {}) + html = await proxyUtil.getDataProxy(url) } catch (err) { console.log(err); } @@ -146,10 +147,10 @@ const parseAllMovie = async (category) => { export default { run: async () => { - await proxy.run(); - await parseAllMovie('movie'); - await parseAllMovie('tv'); - await parseAllMovie('show'); + // await proxy.run(); + // await parseAllMovie('movie'); + // await parseAllMovie('tv'); + // await parseAllMovie('show'); await parseAllMovie('cartoon'); } } diff --git a/src/utils/net.util.js b/src/utils/net.util.js index 451d8de..3d91662 100644 --- a/src/utils/net.util.js +++ b/src/utils/net.util.js @@ -1,6 +1,7 @@ import request from 'request'; import Promise from 'bluebird'; import random_useragent from 'random-useragent'; +import proxy from './proxy.util'; const iconv = require('iconv-lite'); diff --git a/src/utils/proxy.util.js b/src/utils/proxy.util.js index e1cbb5c..4ea2c4f 100644 --- a/src/utils/proxy.util.js +++ b/src/utils/proxy.util.js @@ -1,5 +1,7 @@ import stringUtil from './string.util'; import ProxyInfo from '../models/spider/ProxyInfo'; +import Promise from 'bluebird'; +import random_useragent from 'random-useragent'; let request = require('superagent'); require('superagent-proxy')(request); @@ -11,7 +13,7 @@ export default { if (proxys.length === 0) { proxys = await ProxyInfo.availableList(); } - return proxys[stringUtil.randomNum(0, proxys.length - 1)]; + return proxys[stringUtil.randomNum(0, proxys.length - 1)].link; }, async checkProxy(proxy){ return new Promise(async (resolve, reject) => { @@ -26,5 +28,28 @@ export default { reject(err); } }) - } + }, + // 通过代理来get数据 + async getDataProxy(url) { + if (proxys.length === 0) { + proxys = await ProxyInfo.availableList(); + } + const proxy = 'http://' + proxys[stringUtil.randomNum(0, proxys.length - 1)].link; + return new Promise(async (resolve, reject) => { + try { + let response = await request.get(url) + .set('User-Agent', random_useragent.getRandom()) + .proxy(proxy) + .retry(2) + .timeout(15000); + if(response.statusCode === 200 ){ + resolve(response.text); + } else { + reject(new Error(' server response code: ' + response.statusCode)); + } + } catch (err) { + reject(err); + } + }) + }, }