增加免费代理的抓取
This commit is contained in:
parent
20492021ea
commit
c2add8ff00
@ -34,6 +34,8 @@
|
||||
"request": "^2.88.0",
|
||||
"request-promise": "^4.2.4",
|
||||
"socks5-http-client": "^1.0.4",
|
||||
"superagent": "^5.0.5",
|
||||
"superagent-proxy": "^2.0.0",
|
||||
"ws": "^6.1.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
@ -8,7 +8,7 @@ import movie from './sites/movie';
|
||||
import book from './sites/book';
|
||||
import bookChapter from './sites/bookChapter';
|
||||
import dandanzan from './sites/dandanzan';
|
||||
|
||||
import proxy from './sites/proxy';
|
||||
mongoose.Promise = Promise;
|
||||
|
||||
|
||||
@ -24,7 +24,8 @@ db.once('open', function () {
|
||||
// book.run();
|
||||
// movie.run();
|
||||
// bookChapter.run();
|
||||
dandanzan.run();
|
||||
// dandanzan.run();
|
||||
proxy.run();
|
||||
});
|
||||
mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});
|
||||
|
||||
|
46
src/models/spider/ProxyInfo.js
Normal file
46
src/models/spider/ProxyInfo.js
Normal file
@ -0,0 +1,46 @@
|
||||
import mongoose from 'mongoose';
|
||||
import stringUtil from '../../utils/string.util';
|
||||
|
||||
const {Schema} = mongoose;

// Collection name and Mongoose-managed timestamps for proxy records.
const proxyInfoSchemaOptions = {
  collection: 'proxy_info',
  timestamps: true
};

/**
 * Schema describing one scraped proxy endpoint and its health-check state.
 */
const ProxyInfoSchema = new Schema({
  // Proxy kind: http, socket4, socket5 (as named by the original author;
  // presumably SOCKS4/SOCKS5 - confirm before relying on the values).
  type: {type: String},
  // Proxy address; the scrapers in this project store it as "host:port".
  link: {type: String},
  // Free-form description scraped from the listing site.
  info: {type: String},
  // When the proxy was last checked.
  last_check: {type: Date},
  // Response time measured by the last check.
  response_time: {type: Number},
  // 1: alive, 0: unknown, -1: unavailable
  status: {type: Number, default: 0},
  // Number of consecutive check attempts.
  try_count: {type: Number, default: 0},
}, proxyInfoSchemaOptions);

const ProxyInfoModel = mongoose.model('ProxyInfo', ProxyInfoSchema);
|
||||
/**
 * Insert or update the proxy record identified by its link.
 *
 * NOTE(review): this assignment replaces the `updateOne` static that the
 * model would otherwise expose with a different (link, record) signature -
 * confirm nothing else relies on the stock signature.
 *
 * @param {string} link - proxy address used as the lookup key
 * @param {Object} record - fields to write to the matched (or new) document
 * @returns {Promise<void>} settles once the write has completed
 */
ProxyInfoModel.updateOne = async function (link, record) {
  await ProxyInfoModel.update(
    {link: link},
    record,
    // Create the document when it does not exist and apply schema defaults.
    {upsert: true, setDefaultsOnInsert: true}
  );
}
|
||||
|
||||
/**
 * Build the query for every proxy that still needs checking:
 * status 0 (unknown) or 1 (alive).
 *
 * @returns the result of `ProxyInfoModel.find` for that filter
 */
ProxyInfoModel.needCheckList = function () {
  const checkableStatuses = [0, 1];
  const filter = {status: {$in: checkableStatuses}};
  return ProxyInfoModel.find(filter);
}
|
||||
|
||||
/**
 * Build the query for all usable proxies (status 1).
 *
 * @returns the result of `ProxyInfoModel.find` for that filter
 */
ProxyInfoModel.availableList = function () {
  const aliveOnly = {status: 1};
  return ProxyInfoModel.find(aliveOnly);
}
|
||||
|
||||
/**
 * Build a query limited to a single proxy with status 1.
 *
 * NOTE(review): despite the name, nothing here randomizes the selection;
 * the query simply limits the alive-proxy result set to one entry.
 *
 * @returns the limited `ProxyInfoModel.find` query
 */
ProxyInfoModel.randomProxy = function () {
  const aliveOnly = {status: 1};
  const query = ProxyInfoModel.find(aliveOnly);
  return query.limit(1);
}
|
||||
|
||||
export default ProxyInfoModel;
|
@ -145,12 +145,9 @@ const parseAllMovie = async (category) => {
|
||||
|
||||
export default {
|
||||
run: async () => {
|
||||
// await parseAllMovie('movie');
|
||||
// await parseAllMovie('tv');
|
||||
// await parseAllMovie('show');
|
||||
await parseAllMovie('movie');
|
||||
await parseAllMovie('tv');
|
||||
await parseAllMovie('show');
|
||||
await parseAllMovie('cartoon');
|
||||
// console.log('all done');
|
||||
// let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
|
||||
// console.log(html);
|
||||
}
|
||||
}
|
||||
|
144
src/sites/proxy.js
Normal file
144
src/sites/proxy.js
Normal file
@ -0,0 +1,144 @@
|
||||
import ProxyInfo from '../models/spider/ProxyInfo';
|
||||
import netUtil from "../utils/net.util";
|
||||
import cheerio from "cheerio";
|
||||
import generalQueue from '../utils/general.queue';
|
||||
import proxyUtil from '../utils/proxy.util';
|
||||
|
||||
/**
 * Scrape one listing page of www.xiladaili.com and upsert every proxy found.
 * Page URL pattern: http://www.xiladaili.com/https/1/
 *
 * Rows are written sequentially and awaited, so the returned promise only
 * settles after all upserts for the page have finished. Any failure (fetch,
 * parse or database) is logged and stops processing of this page.
 *
 * @param {number} idx - zero-based page index (page number is idx + 1)
 * @returns {Promise<void>}
 */
const parseOneXiladaili = async (idx) => {
  const url = `http://www.xiladaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    const html = await netUtil.getData(url, {});
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    // Materialize the rows first: awaiting inside cheerio's .each() callback
    // would not be waited on and would leak unhandled rejections.
    const rows = $('.fl-table tbody tr').toArray();
    for (const row of rows) {
      const cells = $(row).find('td');
      const link = cells.first().text().trim();
      if (!link) {
        // Nothing usable in this row (e.g. an empty or header-like row).
        continue;
      }
      const record = {
        link: link,
        type: 'http',
        info: cells.eq(3).text().trim()
      };
      await ProxyInfo.updateOne(link, record);
    }
  } catch (err) {
    console.log(err);
  }
}
|
||||
/**
 * Scrape one listing page of www.xicidaili.com and upsert the proxies that
 * pass a first speed/latency filter.
 * Page URL pattern: https://www.xicidaili.com/nn/1
 *
 * A row is kept only when its speed is <= 5, its connect time is <= 1 and
 * its type column is exactly "HTTPS". Rows whose timing cells are missing or
 * not numeric are skipped instead of aborting the whole page.
 *
 * @param {number} idx - zero-based page index (page number is idx + 1)
 * @returns {Promise<void>}
 */
const parseOneXicidaili = async (idx) => {
  const url = `https://www.xicidaili.com/nn/${idx + 1}`;
  console.log('begin parse page:', url);

  // Reads the leading number of the cell's <div title="...">. The original
  // code stripped the last character (presumably a unit suffix); parseFloat
  // ignores such a suffix and yields NaN when the attribute is absent.
  const readSeconds = (cell) => Number.parseFloat(cell.children('div').attr('title'));

  try {
    const html = await netUtil.getData(url, {});
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const trs = $('#ip_list tr');
    // Row 0 is skipped, as in the original loop (presumably the table header).
    for (let i = 1; i < trs.length; i++) {
      const tds = trs.eq(i).children('td');
      const host = tds.eq(1).text().trim();
      const port = tds.eq(2).text().trim();
      if (!host || !port) {
        continue;
      }
      const type = tds.eq(5).text().trim();
      const speed = readSeconds(tds.eq(6));
      const connectTime = readSeconds(tds.eq(7));
      // First filtering round by speed and connect time; NaN fails both
      // comparisons, so rows without timing data are dropped.
      const fastEnough = speed <= 5 && connectTime <= 1;
      if (!fastEnough || type !== 'HTTPS') {
        continue;
      }
      const link = `${host}:${port}`;
      const record = {
        link: link,
        type: 'http',
        info: tds.eq(3).text().trim()
      };
      await ProxyInfo.updateOne(link, record);
    }
  } catch (err) {
    console.log(err);
  }
}
|
||||
|
||||
/**
 * Probe one proxy record and persist the outcome on the record itself.
 *
 * Sets `status` to 1 (and `response_time` to whatever the probe resolved
 * with) on success, or to -1 on failure; always bumps `try_count` and
 * refreshes `last_check`, then calls `record.save()`. A failed save is
 * logged together with its error and does not throw.
 *
 * @param {Object} record - proxy document with `link`, `try_count` and `save()`
 * @returns {Promise<boolean>} true when the proxy answered the probe
 */
const checkAndUpdate = async (record) => {
  console.log('begin check:', record.link);
  try {
    const time = await proxyUtil.checkProxy(`http://${record.link}`);
    record.status = 1;
    record.response_time = time;
  } catch (err) {
    console.log('not available:', record.link);
    record.status = -1;
  }
  // Treat a missing counter as 0 so the increment never yields NaN.
  record.try_count = (record.try_count || 0) + 1;
  record.last_check = new Date();
  try {
    await record.save();
  } catch (err) {
    // Keep going (best effort), but do not hide why the write failed.
    console.log('error update proxy record', record.link, err);
  }
  return record.status > 0;
}
|
||||
|
||||
/**
 * Scrape the first `maxPage` listing pages of Xiladaili, one after another.
 * A failure on one page is logged and does not stop the remaining pages.
 *
 * @param {number} maxPage - number of pages to visit, starting at index 0
 * @returns {Promise<void>}
 */
const parseXiladaili = async (maxPage) => {
  let pageIndex = 0;
  while (pageIndex < maxPage) {
    try {
      await parseOneXiladaili(pageIndex);
    } catch (pageError) {
      console.log(pageError);
    }
    pageIndex += 1;
  }
  console.log('finish parse all page, Xiladaili');
}
|
||||
/**
 * Scrape the first `maxPage` listing pages of Xicidaili, one after another.
 * A failure on one page is logged and does not stop the remaining pages.
 *
 * @param {number} maxPage - number of pages to visit, starting at index 0
 * @returns {Promise<void>}
 */
const parseXicidaili = async (maxPage) => {
  let pageIndex = 0;
  while (pageIndex < maxPage) {
    try {
      await parseOneXicidaili(pageIndex);
    } catch (pageError) {
      console.log(pageError);
    }
    pageIndex += 1;
  }
  console.log('finish parse all page, Xicidaili');
}
|
||||
/**
 * Check every proxy returned by `ProxyInfo.needCheckList()` through the
 * shared general queue and settle once the queue reports completion.
 *
 * - Rejects when loading the list fails.
 * - Resolves immediately when there is nothing to check; no job would ever
 *   trigger the queue's completion callback in that case.
 *
 * @returns {Promise<void>}
 */
const checkAllProxy = async () => {
  // Load first: a failure here rejects the returned promise and nothing
  // below runs against an undefined list.
  const records = await ProxyInfo.needCheckList();
  if (!records || records.length === 0) {
    console.log('checkAllProxy finished');
    return;
  }

  // Matches the console.timeEnd('all') issued by the queue when it drains.
  console.time('all');
  return new Promise((resolve) => {
    // Register the completion callback before any job is queued.
    generalQueue.setCb(function () {
      console.log('checkAllProxy finished');
      resolve();
    });
    for (const record of records) {
      try {
        generalQueue.addQueue({
          run: async function () {
            await checkAndUpdate(record);
          }
        });
      } catch (err) {
        // Best effort: one record that cannot be queued must not stop the
        // rest, but the reason should be visible.
        console.log('error queue proxy check', record.link, err);
      }
    }
  });
}
|
||||
export default {
  /**
   * Entry point of the proxy job: re-checks all stored proxies and logs any
   * error instead of propagating it. The two scraping steps are currently
   * disabled and kept here as switches.
   *
   * @returns {Promise<void>}
   */
  async run() {
    // await parseXiladaili(20);
    // await parseXicidaili(20);
    await checkAllProxy().catch((err) => {
      console.log(err);
    });
  }
}
|
@ -1,5 +1,6 @@
|
||||
import async from 'async';
|
||||
|
||||
let finishCb;
|
||||
/**
|
||||
* 操作队列
|
||||
* */
|
||||
@ -14,6 +15,7 @@ let q = async.queue( async (reqObj, cb) => {
|
||||
// Drain handler: logs completion, stops the 'all' timer and notifies the
// registered finish callback, if one was set.
q.drain = function () {
  console.info('all queue done');
  console.timeEnd('all');
  if (finishCb) {
    finishCb();
  }
};
|
||||
module.exports = {
|
||||
addQueue(obj) {
|
||||
@ -22,5 +24,8 @@ module.exports = {
|
||||
console.log('error parse: ', obj, err);
|
||||
}
|
||||
});
|
||||
},
|
||||
setCb(cb) {
|
||||
finishCb = cb;
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,6 @@
|
||||
import request from 'request';
|
||||
import Promise from 'bluebird';
|
||||
import proxy from './proxys';
|
||||
import random_useragent from 'random-useragent';
|
||||
let agent = require('socks5-http-client/lib/Agent')
|
||||
|
||||
|
||||
const iconv = require('iconv-lite');
|
||||
@ -55,6 +53,7 @@ export default {
|
||||
(gzip) && (options.gzip = true);
|
||||
return requestData(options, encoding);
|
||||
},
|
||||
// 通过代理来get数据
|
||||
getDataProxy(url, header, encoding, gzip) {
|
||||
header = header || {
|
||||
'Cache-Control': 'no-cache',
|
||||
@ -65,16 +64,55 @@ export default {
|
||||
url: url,
|
||||
headers: header,
|
||||
proxy: proxy.randomProxy(),
|
||||
// agentClass: agent,
|
||||
// agentOptions: {
|
||||
// socksHost: '101.71.41.169',
|
||||
// socksPort: 43,
|
||||
// }
|
||||
};
|
||||
if (encoding) {
|
||||
options.encoding = null;
|
||||
}
|
||||
(gzip) && (options.gzip = true);
|
||||
return requestData(options, encoding);
|
||||
},
|
||||
// 检查proxy是否可用
|
||||
async checkProxy(link){
|
||||
let header = {
|
||||
'Cache-Control': 'no-cache',
|
||||
'User-Agent': random_useragent.getRandom(),
|
||||
'Connection':'close',
|
||||
}
|
||||
//https://wechat-test.kingsome.cn/api/stat
|
||||
//https://www.baidu.com/favicon.ico
|
||||
const options = {
|
||||
url: 'https://wechat-test.kingsome.cn/api/stat',
|
||||
proxy: link,
|
||||
method: 'GET',
|
||||
headers: header,
|
||||
time : true,
|
||||
followRedirect: false,
|
||||
// pool: false,
|
||||
}
|
||||
return new Promise((resolve, reject) => {
|
||||
let req = request(options, (err, response, body) => {
|
||||
if (err) {
|
||||
return reject(err);
|
||||
}
|
||||
console.log('status code:', response.statusCode, 'body:', body);
|
||||
if (response.statusCode === 200 && body === '1') {
|
||||
console.log('Request time in ms', response.elapsedTime);
|
||||
resolve(response.elapsedTime);
|
||||
} else {
|
||||
reject(new Error(' server response code: ' + response.statusCode + ' with url: ' + options.url));
|
||||
}
|
||||
});
|
||||
setTimeout(() => {
|
||||
console.log('check proxy timeout', link);
|
||||
try {
|
||||
req.abort(new Error('timeout'));
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
reject()
|
||||
}
|
||||
reject()
|
||||
}, 5000);
|
||||
})
|
||||
|
||||
}
|
||||
}
|
||||
|
30
src/utils/proxy.util.js
Normal file
30
src/utils/proxy.util.js
Normal file
@ -0,0 +1,30 @@
|
||||
import stringUtil from './string.util';
|
||||
import ProxyInfo from '../models/spider/ProxyInfo';
|
||||
|
||||
let request = require('superagent');
|
||||
require('superagent-proxy')(request);
|
||||
|
||||
let proxys = [];
|
||||
|
||||
export default {
  /**
   * Pick one entry from the cached list of available proxies, loading the
   * list from the database the first time (or while the cache is empty).
   *
   * @returns {Promise<Object|undefined>} a cached proxy entry, or undefined
   *   when no proxy is available
   */
  async randomProxy() {
    if (proxys.length === 0) {
      const loaded = await ProxyInfo.availableList();
      // Never leave the cache in a non-array state.
      proxys = loaded || [];
    }
    if (proxys.length === 0) {
      // Nothing to choose from; avoid asking for a random index in [0, -1].
      return undefined;
    }
    return proxys[stringUtil.randomNum(0, proxys.length - 1)];
  },

  /**
   * Probe a proxy by requesting a known endpoint through it with a 5 second
   * timeout. Succeeds only on status 200 with body '1'.
   *
   * @param {string} proxy - proxy URL handed to superagent-proxy
   * @returns {Promise<number>} elapsed time of the probe in milliseconds
   * @throws {Error} when the request fails, times out or answers unexpectedly
   */
  async checkProxy(proxy) {
    const startedAt = Date.now();
    const response = await request
      .get('https://wechat-test.kingsome.cn/api/stat')
      .proxy(proxy)
      .timeout(5000);
    if (response.statusCode === 200 && response.text === '1') {
      // Callers store this value as the proxy's response time.
      return Date.now() - startedAt;
    }
    throw new Error(' server response code: ' + response.statusCode);
  }
}
|
@ -1,14 +0,0 @@
|
||||
import stringUtil from './string.util';
|
||||
// Hard-coded HTTP proxy endpoints.
const proxys = [
  'http://113.200.56.13:8010',
  'http://65.52.174.40:80',
  'http://165.22.254.199:8080',
  'http://88.255.101.241:8080',
  'http://117.197.117.50:8080'
];

export default {
  /**
   * Return one entry of the hard-coded proxy list, using
   * `stringUtil.randomNum` to choose the index.
   *
   * @returns {string} proxy URL
   */
  randomProxy() {
    const lastIndex = proxys.length - 1;
    const pick = stringUtil.randomNum(0, lastIndex);
    return proxys[pick];
  }
}
|
Loading…
x
Reference in New Issue
Block a user