From c2add8ff008a553027259e209f901f5b1c8c6686 Mon Sep 17 00:00:00 2001
From: zhl
Date: Wed, 8 May 2019 21:03:24 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=85=8D=E8=B4=B9=E4=BB=A3?=
 =?UTF-8?q?=E7=90=86=E7=9A=84=E6=8A=93=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json                   |   2 +
 src/app.js                     |   5 +-
 src/models/spider/ProxyInfo.js |  73 +++++++++++++
 src/sites/dandanzan.js         |   9 +--
 src/sites/proxy.js             | 175 ++++++++++++++++++++++++++++++
 src/utils/general.queue.js     |  12 +++
 src/utils/net.util.js          |  70 +++++++++++--
 src/utils/proxy.util.js        |  42 ++++++++
 src/utils/proxys.js            |  14 ---
 stat                           |   1 +
 10 files changed, 372 insertions(+), 31 deletions(-)
 create mode 100644 src/models/spider/ProxyInfo.js
 create mode 100644 src/sites/proxy.js
 create mode 100644 src/utils/proxy.util.js
 delete mode 100644 src/utils/proxys.js
 create mode 100644 stat

diff --git a/package.json b/package.json
index f56f617..1f87311 100644
--- a/package.json
+++ b/package.json
@@ -34,6 +34,8 @@
     "request": "^2.88.0",
     "request-promise": "^4.2.4",
     "socks5-http-client": "^1.0.4",
+    "superagent": "^5.0.5",
+    "superagent-proxy": "^2.0.0",
     "ws": "^6.1.2"
   },
   "devDependencies": {
diff --git a/src/app.js b/src/app.js
index 3dee0f4..bfbdbef 100644
--- a/src/app.js
+++ b/src/app.js
@@ -8,7 +8,7 @@ import movie from './sites/movie';
 import book from './sites/book';
 import bookChapter from './sites/bookChapter';
 import dandanzan from './sites/dandanzan';
-
+import proxy from './sites/proxy';
 
 mongoose.Promise = Promise;
 
@@ -24,7 +24,8 @@ db.once('open', function () {
   // book.run();
   // movie.run();
   // bookChapter.run();
-  dandanzan.run();
+  // dandanzan.run();
+  proxy.run();
 });
 
 mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});
diff --git a/src/models/spider/ProxyInfo.js b/src/models/spider/ProxyInfo.js
new file mode 100644
index 0000000..bb3bc52
--- /dev/null
+++ b/src/models/spider/ProxyInfo.js
@@ -0,0 +1,73 @@
+import mongoose from 'mongoose';
+import stringUtil from '../../utils/string.util';
+
+const Schema = mongoose.Schema;
+
+/**
+ * One proxy server collected by the crawlers in src/sites/proxy.js, together
+ * with the outcome of its most recent availability check.
+ */
+const ProxyInfoSchema = new Schema({
+  // http, socket4, socket5
+  type: {type: String},
+  // "host:port" of the proxy; doubles as the upsert key
+  link: {type: String},
+  info: {type: String},
+  last_check: {type: Date},
+  // response time in milliseconds of the last successful check
+  response_time: {type: Number},
+  // 1: alive, 0: unknown, -1: unavailable
+  status: {type: Number, default: 0},
+  // number of consecutive check attempts
+  try_count: {type: Number, default: 0},
+}, {
+  collection: 'proxy_info',
+  timestamps: true
+});
+
+const ProxyInfoModel = mongoose.model('ProxyInfo', ProxyInfoSchema);
+
+/**
+ * Insert or update the proxy identified by `link`.
+ *
+ * NOTE(review): this replaces Mongoose's built-in
+ * `Model.updateOne(filter, update, options)` with a different signature. It is
+ * kept because src/sites/proxy.js calls it as `updateOne(link, record)`.
+ *
+ * @param {string} link   proxy address used as the lookup key
+ * @param {Object} record fields to store for that proxy
+ */
+ProxyInfoModel.updateOne = async function (link, record) {
+  const query = {link: link};
+  const options = {upsert: true, setDefaultsOnInsert: true};
+  await ProxyInfoModel.update(query, record, options);
+};
+
+/** All proxies that still need checking (status unknown or alive). */
+ProxyInfoModel.needCheckList = function () {
+  return ProxyInfoModel.find({status: {$in: [0, 1]}});
+};
+
+/** All proxies whose last check succeeded. */
+ProxyInfoModel.availableList = function () {
+  return ProxyInfoModel.find({status: 1});
+};
+
+/**
+ * Pick one available proxy at random.
+ *
+ * The original `find({status: 1}).limit(1)` always returned the same first
+ * match, so a random offset is applied before the limit.
+ *
+ * @returns {Promise<Array>} at most one document; empty when none is available
+ */
+ProxyInfoModel.randomProxy = async function () {
+  const total = await ProxyInfoModel.countDocuments({status: 1});
+  if (total === 0) {
+    return [];
+  }
+  const offset = Math.floor(Math.random() * total);
+  return ProxyInfoModel.find({status: 1}).skip(offset).limit(1);
+};
+
+export default ProxyInfoModel;
diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js
index f659aa2..d0240af 100644
--- a/src/sites/dandanzan.js
+++ b/src/sites/dandanzan.js
@@ -145,12 +145,9 @@ const parseAllMovie = async (category) => {
 
 export default {
   run: async () => {
-    // await parseAllMovie('movie');
-    // await parseAllMovie('tv');
-    // await parseAllMovie('show');
+    await parseAllMovie('movie');
+    await parseAllMovie('tv');
+    await parseAllMovie('show');
     await parseAllMovie('cartoon');
-    // console.log('all done');
-    // let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
-    // console.log(html);
   }
 }
diff --git a/src/sites/proxy.js b/src/sites/proxy.js
new file mode 100644
index 0000000..16f7cbe
--- /dev/null
+++ b/src/sites/proxy.js
@@ -0,0 +1,175 @@
+import ProxyInfo from '../models/spider/ProxyInfo';
+import netUtil from "../utils/net.util";
+import cheerio from "cheerio";
+import generalQueue from '../utils/general.queue';
+import proxyUtil from '../utils/proxy.util';
+
+/**
+ * Crawl one listing page of www.xiladaili.com and upsert every proxy on it.
+ * Example page: http://www.xiladaili.com/https/1/
+ *
+ * @param {number} idx zero-based page index
+ */
+const parseOneXiladaili = async (idx) => {
+  let url = `http://www.xiladaili.com/https/${idx + 1}/`;
+  console.log('begin parse page:', url);
+  try {
+    let html = await netUtil.getData(url, {})
+    if (html) {
+      const $ = cheerio.load(html);
+      const rows = $('.fl-table tbody tr');
+      // Plain loop instead of `.each(async ...)`: cheerio does not wait for
+      // async callbacks, so the upserts kept running after this function had
+      // returned and their failures bypassed the catch below.
+      for (let i = 0; i < rows.length; i++) {
+        const tds = rows.eq(i).find('td');
+        const link = tds.first().text();
+        if (!link) {
+          continue;
+        }
+        const record = {
+          link: link,
+          type: 'http',
+          info: tds.eq(3).text()
+        }
+        await ProxyInfo.updateOne(link, record)
+      }
+    }
+  } catch (err) {
+    console.log(err);
+  }
+}
+
+/**
+ * Crawl one listing page of www.xicidaili.com and upsert the HTTPS proxies
+ * that pass the speed / connect-time filter.
+ * Example page: https://www.xicidaili.com/nn/1
+ *
+ * @param {number} idx zero-based page index
+ */
+const parseOneXicidaili = async (idx) => {
+  let url = `https://www.xicidaili.com/nn/${idx + 1}`;
+  console.log('begin parse page:', url);
+  try {
+    let html = await netUtil.getData(url, {})
+    if (html) {
+      const $ = cheerio.load(html);
+      const trs = $("#ip_list tr");
+      // starts at 1, presumably to skip the table's header row
+      for (let i = 1; i < trs.length; i++) {
+        const tds = trs.eq(i).children("td");
+        const link = `${tds.eq(1).text()}:${tds.eq(2).text()}`
+        const type = tds.eq(5).text();
+        const info = tds.eq(3).text();
+        // parseFloat reads the leading number of the title attribute and
+        // yields NaN when the attribute is missing, so such a row fails the
+        // filter instead of throwing and aborting the rest of the page.
+        const speed = parseFloat(tds.eq(6).children("div").attr("title"));
+        const connectTime = parseFloat(tds.eq(7).children("div").attr("title"));
+        // first-pass filter on speed and connect time
+        if (speed <= 5 && connectTime <= 1 && type === 'HTTPS') {
+          const record = {
+            link: link,
+            type: 'http',
+            info: info
+          }
+          await ProxyInfo.updateOne(link, record)
+        }
+      }
+    }
+  } catch (err) {
+    console.log(err);
+  }
+}
+
+/**
+ * Check one proxy record and persist the outcome.
+ *
+ * @param {Object} record ProxyInfo document to check
+ * @returns {Promise<boolean>} true when the proxy answered the check
+ */
+const checkAndUpdate = async (record) => {
+  console.log('begin check:', record.link);
+  try {
+    let time = await proxyUtil.checkProxy(`http://${record.link}`);
+    record.status = 1;
+    record.response_time = time;
+  } catch (err) {
+    console.log('not available:', record.link);
+    record.status = -1;
+  }
+  record.try_count += 1;
+  record.last_check = new Date();
+  try {
+    await record.save();
+  } catch (err) {
+    console.log('error update proxy record', record.link, err);
+  }
+  return record.status > 0;
+}
+
+/** Crawl the first `maxPage` pages of www.xiladaili.com, one after another. */
+const parseXiladaili = async (maxPage) => {
+  for (let i = 0; i < maxPage; i++) {
+    try {
+      await parseOneXiladaili(i);
+    } catch (err) {
+      console.log(err);
+    }
+  }
+  console.log('finish parse all page, Xiladaili');
+}
+
+/** Crawl the first `maxPage` pages of www.xicidaili.com, one after another. */
+const parseXicidaili = async (maxPage) => {
+  for (let i = 0; i < maxPage; i++) {
+    try {
+      await parseOneXicidaili(i);
+    } catch (err) {
+      console.log(err);
+    }
+  }
+  console.log('finish parse all page, Xicidaili');
+}
+
+/**
+ * Queue an availability check for every proxy that needs one and resolve
+ * once the queue has drained.
+ */
+const checkAllProxy = async () => {
+  // A failed query rejects here; the original called reject() and then still
+  // iterated the undefined result.
+  const records = await ProxyInfo.needCheckList();
+  if (records.length === 0) {
+    // Nothing is queued, so the drain callback would never fire.
+    console.log('checkAllProxy finished');
+    return;
+  }
+  await new Promise((resolve) => {
+    console.time('all')
+    generalQueue.setCb(function () {
+      console.log('checkAllProxy finished');
+      resolve()
+    })
+    for (const record of records) {
+      generalQueue.addQueue({
+        run: async function () {
+          await checkAndUpdate(record);
+        }
+      })
+    }
+  })
+}
+
+export default {
+  run: async () => {
+    try {
+      // Crawling is switched off here; only the stored proxies are re-checked.
+      // await parseXiladaili(20);
+      // await parseXicidaili(20);
+      await checkAllProxy();
+    } catch (err) {
+      console.log(err);
+    }
+  }
+}
diff --git a/src/utils/general.queue.js b/src/utils/general.queue.js
index 2102665..c0b94f6 100644
--- a/src/utils/general.queue.js
+++ b/src/utils/general.queue.js
@@ -1,5 +1,7 @@
 import async from 'async';
 
+// Callback run once when the queue next drains; registered through setCb().
+let finishCb;
 /**
  * 操作队列
  * */
@@ -14,6 +16,12 @@ let q = async.queue( async (reqObj, cb) => {
 q.drain = function(){
   console.info('all queue done');
   console.timeEnd('all');
+  if (finishCb) {
+    // One-shot: clear it first so a later drain cannot re-run a stale callback.
+    const done = finishCb;
+    finishCb = null;
+    done();
+  }
 };
 module.exports = {
   addQueue(obj) {
@@ -22,5 +30,9 @@ module.exports = {
         console.log('error parse: ', obj, err);
       }
     });
+  },
+  /** Register the callback to run when the queue next drains. */
+  setCb(cb) {
+    finishCb = cb;
   }
 }
diff --git a/src/utils/net.util.js b/src/utils/net.util.js
index 6ba968c..451d8de 100644
--- a/src/utils/net.util.js
+++ b/src/utils/net.util.js
@@ -1,8 +1,7 @@
 import request from 'request';
 import Promise from 'bluebird';
-import proxy from './proxys';
+import proxyUtil from './proxy.util';
 import random_useragent from 'random-useragent';
-let agent = require('socks5-http-client/lib/Agent')
 
 const iconv = require('iconv-lite');
 
@@ -55,6 +54,7 @@ export default {
     (gzip) && (options.gzip = true);
     return requestData(options, encoding);
   },
+  // Fetch data through a randomly chosen proxy
   getDataProxy(url, header, encoding, gzip) {
     header = header || {
       'Cache-Control': 'no-cache',
@@ -65,16 +65,68 @@ export default {
       url: url,
       headers: header,
-      proxy: proxy.randomProxy(),
-      // agentClass: agent,
-      // agentOptions: {
-      //   socksHost: '101.71.41.169',
-      //   socksPort: 43,
-      // }
     };
     if (encoding) {
       options.encoding = null;
     }
     (gzip) && (options.gzip = true);
-    return requestData(options, encoding);
+    // `proxy` (the static list in ./proxys) is no longer imported, so the
+    // proxy is taken from the checked records instead. Promise is bluebird here.
+    return Promise.resolve(proxyUtil.randomProxy()).then((record) => {
+      if (!record) {
+        throw new Error('no available proxy');
+      }
+      options.proxy = `http://${record.link}`;
+      return requestData(options, encoding);
+    });
+  },
+  /**
+   * Check whether a proxy is usable by requesting a known URL through it.
+   *
+   * @param {string} link proxy URL handed to request's `proxy` option
+   * @returns {Promise<number>} elapsed time of the request in milliseconds
+   */
+  async checkProxy(link) {
+    const header = {
+      'Cache-Control': 'no-cache',
+      'User-Agent': random_useragent.getRandom(),
+      'Connection': 'close',
+    };
+    // alternative probe target: https://www.baidu.com/favicon.ico
+    const options = {
+      url: 'https://wechat-test.kingsome.cn/api/stat',
+      proxy: link,
+      method: 'GET',
+      headers: header,
+      time: true,
+      followRedirect: false,
+    };
+    return new Promise((resolve, reject) => {
+      let timer = null;
+      const req = request(options, (err, response, body) => {
+        // Stop the watchdog; otherwise it fires 5s after every finished check,
+        // logs a bogus timeout and keeps the process alive meanwhile.
+        clearTimeout(timer);
+        if (err) {
+          return reject(err);
+        }
+        console.log('status code:', response.statusCode, 'body:', body);
+        if (response.statusCode === 200 && body === '1') {
+          console.log('Request time in ms', response.elapsedTime);
+          resolve(response.elapsedTime);
+        } else {
+          reject(new Error(' server response code: ' + response.statusCode + ' with url: ' + options.url));
+        }
+      });
+      timer = setTimeout(() => {
+        console.log('check proxy timeout', link);
+        try {
+          req.abort();
+        } catch (abortErr) {
+          console.log(abortErr);
+        }
+        // Reject with a real Error; the original rejected with undefined.
+        reject(new Error('check proxy timeout: ' + link));
+      }, 5000);
+    });
   }
 }
diff --git a/src/utils/proxy.util.js b/src/utils/proxy.util.js
new file mode 100644
index 0000000..e1cbb5c
--- /dev/null
+++ b/src/utils/proxy.util.js
@@ -0,0 +1,42 @@
+import stringUtil from './string.util';
+import ProxyInfo from '../models/spider/ProxyInfo';
+
+let request = require('superagent');
+require('superagent-proxy')(request);
+
+// URL fetched through a proxy to decide whether it works; it must answer 200
+// with the body "1".
+const CHECK_URL = 'https://wechat-test.kingsome.cn/api/stat';
+
+// Available proxies, loaded from the database on first use.
+let proxys = [];
+
+export default {
+  /**
+   * Pick a random available proxy record.
+   *
+   * @returns {Promise<Object|undefined>} a ProxyInfo document, or undefined
+   *   when no proxy is available
+   */
+  async randomProxy() {
+    if (proxys.length === 0) {
+      proxys = await ProxyInfo.availableList();
+    }
+    return proxys[stringUtil.randomNum(0, proxys.length - 1)];
+  },
+  /**
+   * Check a proxy by requesting CHECK_URL through it, with a 5 second timeout.
+   *
+   * @param {string} proxy proxy URL
+   * @returns {Promise<number>} elapsed time of the request in milliseconds
+   */
+  async checkProxy(proxy) {
+    const startedAt = Date.now();
+    const response = await request.get(CHECK_URL).proxy(proxy).timeout(5000);
+    if (response.statusCode !== 200 || response.text !== '1') {
+      throw new Error(' server response code: ' + response.statusCode);
+    }
+    // The original resolved with nothing, so response_time was never stored.
+    return Date.now() - startedAt;
+  }
+}
diff --git a/src/utils/proxys.js b/src/utils/proxys.js
deleted file mode 100644
index 0383fde..0000000
--- a/src/utils/proxys.js
+++ /dev/null
@@ -1,14 +0,0 @@
-import stringUtil from './string.util';
-const proxys = [
-  'http://113.200.56.13:8010',
-  'http://65.52.174.40:80',
-  'http://165.22.254.199:8080',
-  'http://88.255.101.241:8080',
-  'http://117.197.117.50:8080'
-  ];
-export default {
-
-  randomProxy() {
-    return proxys[stringUtil.randomNum(0, proxys.length - 1)];
-  }
-}
diff --git a/stat b/stat
new file mode 100644
index 0000000..56a6051
--- /dev/null
+++ b/stat
@@ -0,0 +1 @@
+1
\ No newline at end of file