189 lines
5.1 KiB
JavaScript
189 lines
5.1 KiB
JavaScript
import ProxyInfo from '../models/spider/ProxyInfo';
|
|
import netUtil from "../utils/net.util";
|
|
import cheerio from "cheerio";
|
|
import generalQueue from '../utils/general.queue';
|
|
import proxyUtil from '../utils/proxy.util';
|
|
/**
 * Task responsible for crawling open (public) proxy lists from:
 * 1. www.xiladaili.com
 * 2. www.xicidaili.com
 * 3. www.nimadaili.com
 */
|
|
|
|
/**
 * Fetches one listing page of www.xiladaili.com and passes every table row to
 * `ProxyInfo.updateOne`.
 * Page URL pattern: http://www.xiladaili.com/https/<page>/
 *
 * Any failure (download, parsing or persistence) is logged and swallowed, so
 * the returned promise never rejects. A failed write stops processing of the
 * remaining rows of this page.
 *
 * @param {number} idx - Zero-based page index; the page requested is `idx + 1`.
 * @returns {Promise<void>} Settles after all writes for the page have finished
 *   (or after the first failure has been logged).
 */
const parseOneXiladaili = async (idx) => {
  const url = `http://www.xiladaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const rows = $('.fl-table tbody tr').toArray();
    // A plain loop instead of `.each(async ...)`: `.each` does not wait for
    // async callbacks, so writes used to be left floating and a rejected write
    // bypassed the catch below as an unhandled rejection.
    for (const row of rows) {
      const cells = $(row).find('td');
      // The first cell is used as the record key (presumably "ip:port" — verify
      // against the site markup).
      const link = cells.first().text();
      const record = {
        link,
        type: 'http',
        info: cells.eq(3).text(),
      };
      await ProxyInfo.updateOne(link, record);
    }
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
|
|
/**
 * Fetches one listing page of www.xicidaili.com and passes the rows that
 * survive the speed / connect-time / protocol filter to `ProxyInfo.updateOne`.
 * Page URL pattern: https://www.xicidaili.com/wn/<page>
 *
 * Any failure (download, parsing or persistence) is logged and swallowed, so
 * the returned promise never rejects.
 *
 * @param {number} idx - Zero-based page index; the page requested is `idx + 1`.
 * @returns {Promise<void>} Settles after all writes for the page have finished
 *   (or after the first failure has been logged).
 */
const parseOneXicidaili = async (idx) => {
  const url = `https://www.xicidaili.com/wn/${idx + 1}`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const trs = $("#ip_list tr");
    // Row 0 is skipped (presumably the table header — verify against the markup).
    for (let i = 1; i < trs.length; i++) {
      const tds = trs.eq(i).children("td");
      const type = tds.eq(5).text();
      // The `title` attributes carry a number followed by a unit suffix;
      // parseFloat reads the numeric prefix and yields NaN when the attribute
      // is missing, instead of throwing and aborting the whole page.
      const speed = Number.parseFloat(tds.eq(6).children("div").attr("title"));
      const connectTime = Number.parseFloat(tds.eq(7).children("div").attr("title"));
      if (Number.isNaN(speed) || Number.isNaN(connectTime)) {
        continue;
      }
      // First filtering pass: keep only fast HTTPS proxies.
      if (speed <= 5 && connectTime <= 1 && type === 'HTTPS') {
        const link = `${tds.eq(1).text()}:${tds.eq(2).text()}`;
        const record = {
          link,
          type: 'http',
          info: tds.eq(3).text(),
        };
        await ProxyInfo.updateOne(link, record);
      }
    }
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
|
|
|
|
/**
 * Fetches one listing page of www.nimadaili.com and passes every table row to
 * `ProxyInfo.updateOne`.
 * Page URL pattern: http://www.nimadaili.com/https/<page>/
 *
 * Any failure (download, parsing or persistence) is logged and swallowed, so
 * the returned promise never rejects. A failed write stops processing of the
 * remaining rows of this page.
 *
 * @param {number} idx - Zero-based page index; the page requested is `idx + 1`.
 * @returns {Promise<void>} Settles after all writes for the page have finished
 *   (or after the first failure has been logged).
 */
const parseNimadaili = async (idx) => {
  const url = `http://www.nimadaili.com/https/${idx + 1}/`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const rows = $('.fl-table tbody tr').toArray();
    // A plain loop instead of `.each(async ...)`: `.each` does not wait for
    // async callbacks, so writes used to be left floating and a rejected write
    // bypassed the catch below as an unhandled rejection.
    for (const row of rows) {
      const cells = $(row).find('td');
      // The first cell is used as the record key (presumably "ip:port" — verify
      // against the site markup).
      const link = cells.first().text();
      const record = {
        link,
        type: 'http',
        info: cells.eq(3).text(),
      };
      await ProxyInfo.updateOne(link, record);
    }
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};
|
|
/**
 * Probes a single proxy record and stores the outcome on the record.
 *
 * On a successful probe `status` becomes 1 and `response_time` receives the
 * value resolved by `proxyUtil.checkProxy`; on failure `status` becomes -1 and
 * `err_count` is incremented. `try_count` and `last_check` are updated either
 * way, then the record is saved (a save failure is only logged).
 *
 * @param {object} record - Proxy record exposing `link`, the counters above
 *   and a `save()` method.
 * @returns {Promise<boolean>} `true` when the record's status is positive
 *   after the probe.
 */
const checkAndUpdate = async (record) => {
  console.log('begin check:', record.link);

  let elapsed;
  let reachable = true;
  try {
    elapsed = await proxyUtil.checkProxy(`http://${record.link}`);
  } catch (probeError) {
    reachable = false;
  }

  if (reachable) {
    record.status = 1;
    record.response_time = elapsed;
  } else {
    console.log('not available:', record.link);
    record.status = -1;
    record.err_count += 1;
  }

  // Bookkeeping that applies to every attempt, successful or not.
  record.try_count += 1;
  record.last_check = new Date();

  try {
    await record.save();
  } catch (saveError) {
    console.log('error update proxy record', record.link);
  }

  return record.status > 0;
};
|
|
|
|
/**
 * Crawls a run of consecutive xiladaili pages, one after another.
 *
 * @param {number} pageCount - How many pages to crawl.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>} Settles once every page has been attempted; errors
 *   thrown for a single page are logged and do not stop the run.
 */
const parseXiladaili = async (pageCount, beginPage = 0) => {
  const stopBefore = beginPage + pageCount;
  let page = beginPage;
  while (page < stopBefore) {
    try {
      await parseOneXiladaili(page);
    } catch (failure) {
      console.log(failure);
    }
    page++;
  }
  console.log('finish parse all page, Xiladaili');
};
|
|
/**
 * Crawls xicidaili pages with indices 0 .. maxPage - 1, one after another.
 *
 * @param {number} maxPage - Number of pages to crawl.
 * @returns {Promise<void>} Settles once every page has been attempted; errors
 *   thrown for a single page are logged and do not stop the run.
 */
const parseXicidaili = async (maxPage) => {
  let page = 0;
  while (page < maxPage) {
    try {
      await parseOneXicidaili(page);
    } catch (failure) {
      console.log(failure);
    }
    page++;
  }
  console.log('finish parse all page, Xicidaili');
};
|
|
/**
 * Crawls a run of consecutive nimadaili pages, one after another.
 *
 * @param {number} pageCount - How many pages to crawl.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>} Settles once every page has been attempted; errors
 *   thrown for a single page are logged and do not stop the run.
 */
const parseNimadailidaili = async (pageCount, beginPage = 0) => {
  const stopBefore = beginPage + pageCount;
  let page = beginPage;
  while (page < stopBefore) {
    try {
      await parseNimadaili(page);
    } catch (failure) {
      console.log(failure);
    }
    page++;
  }
  console.log('finish parse all page, nimadailidaili');
};
|
|
/**
 * Queues a health check for every record returned by
 * `ProxyInfo.needCheckList(all)` and waits for the shared queue to report
 * completion.
 *
 * @param {*} all - Forwarded unchanged to `ProxyInfo.needCheckList`
 *   (presumably "check every proxy" vs. "only those due" — confirm against
 *   that method).
 * @returns {Promise<void>} Resolves when the queue's completion callback fires,
 *   or immediately when there is nothing to check.
 * @throws Rejects with the error from `ProxyInfo.needCheckList` when the list
 *   cannot be loaded.
 */
const checkAllProxy = async (all) => {
  // Load the list before building the promise: a failure here now rejects
  // cleanly instead of falling through to iterate an undefined list.
  const records = await ProxyInfo.needCheckList(all);

  // NOTE(review): assumes generalQueue only invokes its callback after queued
  // jobs finish, so with nothing queued we must not wait on it — confirm.
  if (!records || records.length === 0) {
    console.log('checkAllProxy finished');
    return;
  }

  await new Promise((resolve) => {
    console.time('all');
    generalQueue.setCb(() => {
      console.timeEnd('all');
      console.log('checkAllProxy finished');
      resolve();
    });
    for (const record of records) {
      try {
        generalQueue.addQueue({
          async run() {
            await checkAndUpdate(record);
          },
        });
      } catch (err) {
        // Best effort: one record that cannot be queued must not stop the rest.
        console.log('error queue proxy check', record.link, err);
      }
    }
  });
};
|
|
export default {
  /**
   * Task entry point: crawls the xiladaili and nimadaili listings (50 pages
   * each) and then runs a health check via `checkAllProxy(true)`.
   * Errors from any step are logged and not rethrown.
   *
   * @returns {Promise<void>}
   */
  async run() {
    try {
      await parseXiladaili(50);
      // xicidaili crawling is currently disabled: await parseXicidaili(20);
      await parseNimadailidaili(50);
      await checkAllProxy(true);
    } catch (failure) {
      console.log(failure);
    }
  },
};
|