增加免费代理的抓取

This commit is contained in:
zhl 2019-05-08 21:03:24 +08:00
parent 20492021ea
commit c2add8ff00
10 changed files with 279 additions and 29 deletions

View File

@ -34,6 +34,8 @@
"request": "^2.88.0",
"request-promise": "^4.2.4",
"socks5-http-client": "^1.0.4",
"superagent": "^5.0.5",
"superagent-proxy": "^2.0.0",
"ws": "^6.1.2"
},
"devDependencies": {

View File

@ -8,7 +8,7 @@ import movie from './sites/movie';
import book from './sites/book';
import bookChapter from './sites/bookChapter';
import dandanzan from './sites/dandanzan';
import proxy from './sites/proxy';
mongoose.Promise = Promise;
@ -24,7 +24,8 @@ db.once('open', function () {
// book.run();
// movie.run();
// bookChapter.run();
dandanzan.run();
// dandanzan.run();
proxy.run();
});
mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});

View File

@ -0,0 +1,46 @@
import mongoose from 'mongoose';
import stringUtil from '../../utils/string.util';
const {Schema} = mongoose;
// Field definitions for one scraped proxy endpoint.
const proxyInfoFields = {
  // http, socket4, socket5
  type: {type: String},
  link: {type: String},
  info: {type: String},
  last_check: {type: Date},
  // response time
  response_time: {type: Number},
  // 1: alive, 0: unknown, -1: unavailable
  status: {type: Number, default: 0},
  // consecutive attempt count
  try_count: {type: Number, default: 0},
};
// Documents live in the `proxy_info` collection; Mongoose maintains createdAt/updatedAt.
const proxyInfoOptions = {
  collection: 'proxy_info',
  timestamps: true
};
const ProxyInfoSchema = new Schema(proxyInfoFields, proxyInfoOptions);
const ProxyInfoModel = mongoose.model('ProxyInfo', ProxyInfoSchema);
// Keep a handle on Mongoose's built-in `Model.updateOne` before the assignment
// below shadows it with the project-specific (link, record) signature.
const builtinUpdateOne = ProxyInfoModel.updateOne;
/**
 * Upsert a proxy record keyed by its `link`.
 *
 * NOTE: this intentionally keeps the name `updateOne` because callers use
 * `ProxyInfo.updateOne(link, record)`, but it replaces Mongoose's static of
 * the same name and has a different signature.
 *
 * @param {string} link   proxy address used as the lookup key
 * @param {Object} record fields to write; inserted with schema defaults when no match exists
 * @returns {Promise<void>}
 */
ProxyInfoModel.updateOne = async function (link, record) {
  const query = {link: link};
  const options = {upsert: true, setDefaultsOnInsert: true};
  // Delegate to the built-in updateOne instead of the deprecated Model.update.
  await builtinUpdateOne.call(ProxyInfoModel, query, record, options);
}
/**
 * Proxies that still need to be checked: status 0 (unknown) or 1 (alive).
 * @returns the Mongoose query produced by `find`
 */
ProxyInfoModel.needCheckList = () => {
  const checkableStatuses = [0, 1];
  const filter = {status: {$in: checkableStatuses}};
  return ProxyInfoModel.find(filter);
};
/**
 * All proxies currently marked alive (status 1).
 * @returns the Mongoose query produced by `find`
 */
ProxyInfoModel.availableList = () => {
  const aliveFilter = {status: 1};
  return ProxyInfoModel.find(aliveFilter);
};
/**
 * Query for a single alive proxy (status 1).
 * Despite the name, no randomisation is applied here: the query is simply
 * limited to one document, and it is built with `find`, not `findOne`.
 * @returns the Mongoose query produced by `find(...).limit(1)`
 */
ProxyInfoModel.randomProxy = () => {
  const aliveQuery = ProxyInfoModel.find({status: 1});
  return aliveQuery.limit(1);
};
export default ProxyInfoModel;

View File

@ -145,12 +145,9 @@ const parseAllMovie = async (category) => {
export default {
run: async () => {
// await parseAllMovie('movie');
// await parseAllMovie('tv');
// await parseAllMovie('show');
await parseAllMovie('movie');
await parseAllMovie('tv');
await parseAllMovie('show');
await parseAllMovie('cartoon');
// console.log('all done');
// let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
// console.log(html);
}
}

144
src/sites/proxy.js Normal file
View File

@ -0,0 +1,144 @@
import ProxyInfo from '../models/spider/ProxyInfo';
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import generalQueue from '../utils/general.queue';
import proxyUtil from '../utils/proxy.util';
/**
 * Scrape one listing page of www.xiladaili.com and upsert every proxy on it.
 * Example page: http://www.xiladaili.com/https/1/
 *
 * Rows are stored one after another and each upsert is awaited, so the
 * returned promise only settles once the whole page has been written and
 * any storage error lands in the catch below instead of becoming an
 * unhandled rejection.
 *
 * @param {number} idx zero-based page index (the URL uses idx + 1)
 * @returns {Promise<void>} errors are logged, never thrown
 */
const parseOneXiladaili = async (idx) => {
  let url = `http://www.xiladaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    let html = await netUtil.getData(url, {})
    if (html) {
      const $ = cheerio.load(html);
      const rows = $('.fl-table tbody tr').toArray();
      for (const tr of rows) {
        const cells = $(tr).find('td');
        const link = cells.first().text().trim();
        // A row without an address would upsert a record keyed by ''.
        if (!link) {
          continue;
        }
        const record = {
          link: link,
          type: 'http',
          info: cells.eq(3).text()
        }
        await ProxyInfo.updateOne(link, record)
      }
    }
  } catch (err) {
    console.log(err);
  }
}
/**
 * Scrape one listing page of www.xicidaili.com and upsert the proxies that
 * pass a first speed / connect-time filter.
 * Example page: https://www.xicidaili.com/nn/1
 *
 * @param {number} idx zero-based page index (the URL uses idx + 1)
 * @returns {Promise<void>} errors are logged, never thrown
 */
const parseOneXicidaili = async (idx) => {
  let url = `https://www.xicidaili.com/nn/${idx + 1}`;
  console.log('begin parse page:', url);
  try {
    let html = await netUtil.getData(url, {})
    if (html) {
      const $ = cheerio.load(html);
      const trs = $("#ip_list tr");
      // Start at 1: the first row is skipped (presumably the table header).
      for (let i = 1; i < trs.length; i++) {
        const tds = trs.eq(i).children("td");
        const link = `${tds.eq(1).text()}:${tds.eq(2).text()}`
        const type = tds.eq(5).text();
        const info = tds.eq(3).text();
        // The title attribute carries a number followed by a one-character
        // suffix; parseFloat reads the leading number and yields NaN when the
        // attribute is missing, instead of throwing on `undefined.substring`.
        const speed = parseFloat(tds.eq(6).children("div").attr("title"));
        const connectTime = parseFloat(tds.eq(7).children("div").attr("title"));
        if (Number.isNaN(speed) || Number.isNaN(connectTime)) {
          // Malformed row: skip it rather than abandoning the rest of the page.
          continue;
        }
        // First-pass filter on speed and connect time.
        if (speed <= 5 && connectTime <= 1 && type === 'HTTPS') {
          const record = {
            link: link,
            type: 'http',
            info: info
          }
          await ProxyInfo.updateOne(link, record)
        }
      }
    }
  } catch (err) {
    console.log(err);
  }
}
/**
 * Probe one proxy record and persist the outcome.
 *
 * Sets `status` to 1 on success or -1 on failure, bumps `try_count`, stamps
 * `last_check`, then saves the record. A save failure is logged and does not
 * change the return value.
 *
 * @param {Object} record ProxyInfo document (needs `link`, `try_count`, `save()`)
 * @returns {Promise<boolean>} true when the proxy answered correctly
 */
const checkAndUpdate = async (record) => {
  console.log('begin check:', record.link);
  const startedAt = Date.now();
  try {
    let time = await proxyUtil.checkProxy(`http://${record.link}`);
    record.status = 1;
    // The checker may resolve without a value; fall back to the locally
    // measured elapsed time so response_time is never stored as undefined.
    record.response_time = typeof time === 'number' ? time : Date.now() - startedAt;
  } catch (err) {
    console.log('not available:', record.link);
    record.status = -1;
  }
  record.try_count += 1;
  record.last_check = new Date();
  try {
    await record.save();
  } catch (err) {
    console.log('error update proxy record', record.link, err);
  }
  return record.status > 0;
}
/**
 * Scrape Xiladaili listing pages 0 .. maxPage-1, one after another.
 * A failure on one page is logged and does not stop the remaining pages.
 *
 * @param {number} maxPage number of pages to visit
 * @returns {Promise<void>}
 */
const parseXiladaili = async (maxPage) => {
  let page = 0;
  while (page < maxPage) {
    try {
      await parseOneXiladaili(page);
    } catch (pageErr) {
      console.log(pageErr);
    }
    page += 1;
  }
  console.log('finish parse all page, Xiladaili');
}
/**
 * Scrape Xicidaili listing pages 0 .. maxPage-1, one after another.
 * A failure on one page is logged and does not stop the remaining pages.
 *
 * @param {number} maxPage number of pages to visit
 * @returns {Promise<void>}
 */
const parseXicidaili = async (maxPage) => {
  let page = 0;
  while (page < maxPage) {
    try {
      await parseOneXicidaili(page);
    } catch (pageErr) {
      console.log(pageErr);
    }
    page += 1;
  }
  console.log('finish parse all page, Xicidaili');
}
/**
 * Re-check every proxy returned by `ProxyInfo.needCheckList()` through the
 * shared general queue.
 *
 * The record list is loaded before anything is queued, so a database failure
 * rejects the returned promise directly. When there is nothing to check the
 * promise resolves immediately; otherwise it resolves when the queue invokes
 * the callback registered through `generalQueue.setCb`.
 *
 * @returns {Promise<void>}
 */
const checkAllProxy = async () => {
  const records = await ProxyInfo.needCheckList();
  if (!records || records.length === 0) {
    // Nothing gets queued, so the queue's finish callback would never fire.
    console.log('checkAllProxy finished');
    return;
  }
  return new Promise((resolve) => {
    console.time('all')
    generalQueue.setCb(function () {
      console.log('checkAllProxy finished');
      resolve()
    })
    for (const record of records) {
      try {
        generalQueue.addQueue({
          run: async function () {
            await checkAndUpdate(record);
          }
        })
      } catch (err) {
        // Keep queuing the remaining records, but do not hide the failure.
        console.log('error queue proxy check', record.link, err);
      }
    }
  })
}
export default {
  /**
   * Entry point of the proxy job: currently only re-checks the stored
   * proxies. Any error is logged and not rethrown.
   * @returns {Promise<void>}
   */
  async run() {
    try {
      // Scraping steps are currently disabled:
      // await parseXiladaili(20);
      // await parseXicidaili(20);
      await checkAllProxy();
    } catch (runErr) {
      console.log(runErr);
    }
  }
}

View File

@ -1,5 +1,6 @@
import async from 'async';
let finishCb;
/**
* 操作队列
* */
@ -14,6 +15,7 @@ let q = async.queue( async (reqObj, cb) => {
// Drain handler assigned on the queue: logs completion, stops the 'all'
// console timer, then notifies the callback registered via setCb, if any.
q.drain = () => {
  console.info('all queue done');
  console.timeEnd('all');
  if (finishCb) {
    finishCb();
  }
};
module.exports = {
addQueue(obj) {
@ -22,5 +24,8 @@ module.exports = {
console.log('error parse: ', obj, err);
}
});
},
setCb(cb) {
finishCb = cb;
}
}

View File

@ -1,8 +1,6 @@
import request from 'request';
import Promise from 'bluebird';
import proxy from './proxys';
import random_useragent from 'random-useragent';
let agent = require('socks5-http-client/lib/Agent')
const iconv = require('iconv-lite');
@ -55,6 +53,7 @@ export default {
(gzip) && (options.gzip = true);
return requestData(options, encoding);
},
// 通过代理来get数据
getDataProxy(url, header, encoding, gzip) {
header = header || {
'Cache-Control': 'no-cache',
@ -65,16 +64,55 @@ export default {
url: url,
headers: header,
proxy: proxy.randomProxy(),
// agentClass: agent,
// agentOptions: {
// socksHost: '101.71.41.169',
// socksPort: 43,
// }
};
if (encoding) {
options.encoding = null;
}
(gzip) && (options.gzip = true);
return requestData(options, encoding);
},
// 检查proxy是否可用
async checkProxy(link){
let header = {
'Cache-Control': 'no-cache',
'User-Agent': random_useragent.getRandom(),
'Connection':'close',
}
//https://wechat-test.kingsome.cn/api/stat
//https://www.baidu.com/favicon.ico
const options = {
url: 'https://wechat-test.kingsome.cn/api/stat',
proxy: link,
method: 'GET',
headers: header,
time : true,
followRedirect: false,
// pool: false,
}
return new Promise((resolve, reject) => {
let req = request(options, (err, response, body) => {
if (err) {
return reject(err);
}
console.log('status code:', response.statusCode, 'body:', body);
if (response.statusCode === 200 && body === '1') {
console.log('Request time in ms', response.elapsedTime);
resolve(response.elapsedTime);
} else {
reject(new Error(' server response code: ' + response.statusCode + ' with url: ' + options.url));
}
});
setTimeout(() => {
console.log('check proxy timeout', link);
try {
req.abort(new Error('timeout'));
} catch (err) {
console.log(err);
reject()
}
reject()
}, 5000);
})
}
}

30
src/utils/proxy.util.js Normal file
View File

@ -0,0 +1,30 @@
import stringUtil from './string.util';
import ProxyInfo from '../models/spider/ProxyInfo';
let request = require('superagent');
require('superagent-proxy')(request);
// In-memory cache of available proxy records; filled on first use.
let proxys = [];
export default {
  /**
   * Pick one entry from the cached list of available proxies, loading the
   * list from `ProxyInfo.availableList()` while the cache is empty.
   *
   * @returns {Promise<Object|undefined>} a cached entry, or undefined when no
   *   proxy is available
   */
  async randomProxy() {
    if (proxys.length === 0) {
      proxys = await ProxyInfo.availableList();
    }
    if (!proxys || proxys.length === 0) {
      // Nothing to choose from; avoid indexing with an invalid range.
      return undefined;
    }
    return proxys[stringUtil.randomNum(0, proxys.length - 1)];
  },
  /**
   * Check a proxy by requesting a fixed endpoint through it (5000 ms timeout).
   *
   * @param {string} proxy proxy URL passed to superagent-proxy
   * @returns {Promise<number>} elapsed wall-clock time of the request in
   *   milliseconds when the endpoint answers 200 with body '1'
   * @throws {Error} on request failure, timeout, or an unexpected response
   */
  async checkProxy(proxy){
    const startedAt = Date.now();
    const response = await request.get('https://wechat-test.kingsome.cn/api/stat').proxy(proxy).timeout(5000);
    if (response.statusCode === 200 && response.text === '1') {
      // Callers store the resolved value as the proxy's response time.
      return Date.now() - startedAt;
    }
    throw new Error(' server response code: ' + response.statusCode);
  }
}

View File

@ -1,14 +0,0 @@
import stringUtil from './string.util';
// Static list of proxy URLs to pick from.
const proxys = [
  'http://113.200.56.13:8010',
  'http://65.52.174.40:80',
  'http://165.22.254.199:8080',
  'http://88.255.101.241:8080',
  'http://117.197.117.50:8080'
];
export default {
  /**
   * Pick one proxy URL from the static list, using an index obtained from
   * `stringUtil.randomNum(0, proxys.length - 1)`.
   * @returns {string} the selected proxy URL
   */
  randomProxy() {
    const lastIndex = proxys.length - 1;
    const picked = stringUtil.randomNum(0, lastIndex);
    return proxys[picked];
  }
}

1
stat Normal file
View File

@ -0,0 +1 @@
1