spider/src/sites/proxy.js
2019-05-09 17:26:04 +08:00

189 lines
5.1 KiB
JavaScript

import ProxyInfo from '../models/spider/ProxyInfo';
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import generalQueue from '../utils/general.queue';
import proxyUtil from '../utils/proxy.util';
/**
 * Tasks responsible for scraping public (open) proxy lists from:
 * 1. www.xiladaili.com
 * 2. www.xicidaili.com
 * 3. www.nimadaili.com
 */
/**
 * Scrape one listing page of www.xiladaili.com and store every proxy row found.
 * Page URL pattern: http://www.xiladaili.com/https/<page>/
 *
 * All writes for the page are started together and awaited, so the returned
 * promise does not settle before the writes have finished (or one has failed).
 *
 * @param {number} idx - Zero-based page index; page `idx + 1` is requested.
 * @returns {Promise<void>} Never rejects: fetch, parse and write errors are logged.
 */
const parseOneXiladaili = async (idx) => {
  const url = `http://www.xiladaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const records = $('.fl-table tbody tr')
      .toArray()
      .map((tr) => {
        const cells = $(tr).find('td');
        return {
          // First cell holds the proxy address; the link is used as the record key.
          link: cells.first().text().trim(),
          type: 'http',
          info: cells.eq(3).text(),
        };
      })
      // Rows without an address (e.g. rows with no <td>) cannot be stored under a key.
      .filter((record) => record.link !== '');

    // Awaiting here (instead of firing async callbacks from `.each`) keeps write
    // failures inside this try/catch rather than surfacing as unhandled rejections.
    await Promise.all(
      records.map(async (record) => ProxyInfo.updateOne(record.link, record)),
    );
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
/**
 * Scrape one listing page of www.xicidaili.com and store the rows that pass
 * the speed / connect-time / type filter.
 * Page URL pattern: https://www.xicidaili.com/wn/<page>
 *
 * @param {number} idx - Zero-based page index; page `idx + 1` is requested.
 * @returns {Promise<void>} Never rejects: fetch, parse and write errors are logged.
 */
const parseOneXicidaili = async (idx) => {
  const url = `https://www.xicidaili.com/wn/${idx + 1}`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const trs = $('#ip_list tr');
    // Row 0 is skipped, exactly as before (presumably the table header — confirm against the page).
    for (let i = 1; i < trs.length; i++) {
      const tds = trs.eq(i).children('td');
      const speedTitle = tds.eq(6).children('div').attr('title');
      const connectTitle = tds.eq(7).children('div').attr('title');
      // A row without both title attributes is skipped instead of throwing,
      // which previously aborted every remaining row of the page.
      if (!speedTitle || !connectTitle) {
        continue;
      }
      // The titles are expected to be a number followed by a unit suffix;
      // parseFloat reads the leading number and yields NaN for anything else.
      const speed = Number.parseFloat(speedTitle);
      const connectTime = Number.parseFloat(connectTitle);
      const type = tds.eq(5).text().trim();
      // First-pass filter on speed and connect time; NaN fails both comparisons.
      if (!(speed <= 5 && connectTime <= 1) || type !== 'HTTPS') {
        continue;
      }
      const link = `${tds.eq(1).text().trim()}:${tds.eq(2).text().trim()}`;
      await ProxyInfo.updateOne(link, {
        link,
        type: 'http',
        info: tds.eq(3).text(),
      });
    }
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
/**
 * Scrape one listing page of www.nimadaili.com and store every proxy row found.
 * Page URL pattern: http://www.nimadaili.com/https/<page>/
 *
 * All writes for the page are started together and awaited, so the returned
 * promise does not settle before the writes have finished (or one has failed).
 *
 * @param {number} idx - Zero-based page index; page `idx + 1` is requested.
 * @returns {Promise<void>} Never rejects: fetch, parse and write errors are logged.
 */
const parseNimadaili = async (idx) => {
  const url = `http://www.nimadaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const records = $('.fl-table tbody tr')
      .toArray()
      .map((tr) => {
        const cells = $(tr).find('td');
        return {
          // First cell holds the proxy address; the link is used as the record key.
          link: cells.first().text().trim(),
          type: 'http',
          info: cells.eq(3).text(),
        };
      })
      // Rows without an address (e.g. rows with no <td>) cannot be stored under a key.
      .filter((record) => record.link !== '');

    // Awaiting here (instead of firing async callbacks from `.each`) keeps write
    // failures inside this try/catch rather than surfacing as unhandled rejections.
    await Promise.all(
      records.map(async (record) => ProxyInfo.updateOne(record.link, record)),
    );
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
/**
 * Probe a single proxy record and persist the outcome on the record itself.
 *
 * On a successful probe `status` becomes 1 and `response_time` receives the
 * value returned by `proxyUtil.checkProxy`; on failure `status` becomes -1 and
 * `err_count` is incremented. `try_count` and `last_check` are updated either
 * way, then the record is saved (a save failure is only logged).
 *
 * @param {object} record - Proxy record exposing `link`, the counters above and `save()`.
 * @returns {Promise<boolean>} `true` when the resulting `status` is greater than 0.
 */
const checkAndUpdate = async (record) => {
  const proxyAddress = record.link;
  console.log('begin check:', proxyAddress);

  try {
    const elapsed = await proxyUtil.checkProxy(`http://${proxyAddress}`);
    Object.assign(record, { status: 1, response_time: elapsed });
  } catch (probeError) {
    console.log('not available:', proxyAddress);
    record.status = -1;
    record.err_count = record.err_count + 1;
  }

  // Bookkeeping happens for every attempt, reachable or not.
  record.try_count = record.try_count + 1;
  record.last_check = new Date();

  try {
    await record.save();
  } catch (saveError) {
    console.log('error update proxy record', proxyAddress);
  }

  return record.status > 0;
};
/**
 * Scrape a run of consecutive www.xiladaili.com listing pages, one at a time.
 *
 * @param {number} pageCount - Number of pages to scrape.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>} Resolves after the last page; per-page errors are logged and skipped.
 */
const parseXiladaili = async (pageCount, beginPage = 0) => {
  const endPage = beginPage + pageCount;
  let page = beginPage;
  while (page < endPage) {
    try {
      await parseOneXiladaili(page);
    } catch (pageError) {
      // One bad page must not stop the remaining pages.
      console.log(pageError);
    }
    page++;
  }
  console.log('finish parse all page, Xiladaili');
};
/**
 * Scrape the first `maxPage` www.xicidaili.com listing pages, one at a time.
 *
 * @param {number} maxPage - Number of pages to scrape, starting at page index 0.
 * @returns {Promise<void>} Resolves after the last page; per-page errors are logged and skipped.
 */
const parseXicidaili = async (maxPage) => {
  let page = 0;
  while (page < maxPage) {
    try {
      await parseOneXicidaili(page);
    } catch (pageError) {
      // One bad page must not stop the remaining pages.
      console.log(pageError);
    }
    page++;
  }
  console.log('finish parse all page, Xicidaili');
};
/**
 * Scrape a run of consecutive www.nimadaili.com listing pages, one at a time.
 *
 * @param {number} pageCount - Number of pages to scrape.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>} Resolves after the last page; per-page errors are logged and skipped.
 */
const parseNimadailidaili = async (pageCount, beginPage = 0) => {
  const endPage = beginPage + pageCount;
  let page = beginPage;
  while (page < endPage) {
    try {
      await parseNimadaili(page);
    } catch (pageError) {
      // One bad page must not stop the remaining pages.
      console.log(pageError);
    }
    page++;
  }
  console.log('finish parse all page, nimadailidaili');
};
/**
 * Load the proxy records that need checking and push one check task per record
 * onto the shared general queue.
 *
 * The record list is loaded before anything is enqueued, so a load failure
 * rejects the returned promise cleanly instead of falling through to the
 * enqueue loop with no records.
 *
 * @param {*} all - Passed straight to `ProxyInfo.needCheckList`.
 * @returns {Promise<void>} Resolves when the queue invokes its completion
 *   callback, or right away when nothing could be enqueued. Rejects when the
 *   record list cannot be loaded.
 */
const checkAllProxy = async (all) => {
  // NOTE(review): no matching console.timeEnd('all') is visible in this block.
  console.time('all');
  const records = await ProxyInfo.needCheckList(all);

  await new Promise((resolve) => {
    const finish = () => {
      console.log('checkAllProxy finished');
      resolve();
    };
    generalQueue.setCb(finish);

    let queued = 0;
    for (const record of records || []) {
      try {
        generalQueue.addQueue({
          async run() {
            await checkAndUpdate(record);
          },
        });
        queued += 1;
      } catch (err) {
        console.log('error enqueue proxy check', record && record.link, err);
      }
    }

    // With nothing enqueued the queue has no work to finish, so do not wait on
    // its callback (assumes the callback only fires after queued work — confirm).
    if (queued === 0) {
      finish();
    }
  });
};
export default {
  /**
   * Entry point of the proxy spider task: scrape the proxy listings, then
   * re-check every stored proxy. Stages run strictly one after another; the
   * first stage that throws stops the run and the error is logged.
   *
   * @returns {Promise<void>} Never rejects.
   */
  async run() {
    // The www.xicidaili.com stage (parseXicidaili(20)) is commented out in the
    // original stage list and stays disabled here.
    const stages = [
      () => parseXiladaili(50),
      () => parseNimadailidaili(50),
      () => checkAllProxy(true),
    ];
    try {
      for (const stage of stages) {
        await stage();
      }
    } catch (err) {
      console.log(err);
    }
  },
};