import ProxyInfo from '../models/spider/ProxyInfo';
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import generalQueue from '../utils/general.queue';
import proxyUtil from '../utils/proxy.util';

/**
 * Task responsible for scraping open proxy lists and re-checking stored proxies.
 * Sources:
 * 1. www.xiladaili.com
 * 2. www.xicidaili.com
 * 3. www.nimadaili.com
 */

// Upper bounds used by the xicidaili first-pass filter. The values are compared
// against the numeric prefix of the "title" attributes in columns 6 and 7.
const MAX_XICI_SPEED = 5;
const MAX_XICI_CONNECT_TIME = 1;

/**
 * Scrape one listing page that uses the `.fl-table` layout (shared by
 * xiladaili and nimadaili) and store every row through ProxyInfo.updateOne.
 *
 * The first cell of a row is used as the proxy link and the fourth cell as the
 * info text. Errors are logged and never rethrown, so one bad page does not
 * stop the surrounding page loop.
 *
 * @param {string} url - Page to fetch via proxyUtil.getDataProxy.
 * @returns {Promise<void>} Settles after every row write has settled or failed.
 */
const parseFlTablePage = async (url) => {
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const rows = $('.fl-table tbody tr').toArray();
    // Await every write: an async callback passed to cheerio's .each() is not
    // awaited, which would let rejections escape this try/catch.
    await Promise.all(rows.map(async (row) => {
      const cells = $(row).find('td');
      const link = cells.first().text();
      if (!link) {
        // Row without a first cell / empty address: nothing to store.
        return;
      }
      const record = {
        link,
        type: 'http',
        info: cells.eq(3).text(),
      };
      await ProxyInfo.updateOne(link, record);
    }));
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};

/**
 * Run a single-page parser over the half-open page range [beginPage, endPage),
 * one page at a time, then log a completion message.
 *
 * @param {(idx: number) => Promise<void>} parseOne - Parser for one zero-based page index.
 * @param {number} beginPage - First page index (inclusive).
 * @param {number} endPage - Last page index (exclusive).
 * @param {string} label - Source name appended to the completion log line.
 * @returns {Promise<void>}
 */
const parsePageRange = async (parseOne, beginPage, endPage, label) => {
  for (let i = beginPage; i < endPage; i++) {
    try {
      await parseOne(i);
    } catch (err) {
      console.log(err);
    }
  }
  console.log(`finish parse all page, ${label}`);
};

/**
 * Fetch proxy data from www.xiladaili.com.
 * Example page: http://www.xiladaili.com/https/1/
 *
 * @param {number} idx - Zero-based page index (page idx + 1 is requested).
 * @returns {Promise<void>}
 */
const parseOneXiladaili = async (idx) => {
  await parseFlTablePage(`http://www.xiladaili.com/https/${idx + 1}/`);
};

/**
 * Fetch proxy data from www.xicidaili.com.
 * Example page: https://www.xicidaili.com/wn/1
 *
 * Only rows whose type is 'HTTPS' and whose speed / connect time are within
 * MAX_XICI_SPEED / MAX_XICI_CONNECT_TIME are stored. Errors are logged and
 * never rethrown.
 *
 * @param {number} idx - Zero-based page index (page idx + 1 is requested).
 * @returns {Promise<void>}
 */
const parseOneXicidaili = async (idx) => {
  const url = `https://www.xicidaili.com/wn/${idx + 1}`;
  console.log('begin parse page:', url);
  try {
    const html = await proxyUtil.getDataProxy(url);
    if (!html) {
      return;
    }
    const $ = cheerio.load(html);
    const trs = $("#ip_list tr");
    // Row 0 is skipped, as in the original loop (presumably the table header).
    for (let i = 1; i < trs.length; i++) {
      const tds = trs.eq(i).children("td");
      const link = `${tds.eq(1).text()}:${tds.eq(2).text()}`;
      const type = tds.eq(5).text();
      const info = tds.eq(3).text();
      // The title attributes carry a number followed by a one-character unit
      // suffix. parseFloat reads the numeric prefix and yields NaN when the
      // attribute is missing, so a malformed row is skipped instead of
      // throwing and aborting the rest of the page.
      const speed = Number.parseFloat(tds.eq(6).children("div").attr("title"));
      const connectTime = Number.parseFloat(tds.eq(7).children("div").attr("title"));
      // First-pass filter by speed and connect time.
      if (speed <= MAX_XICI_SPEED && connectTime <= MAX_XICI_CONNECT_TIME && type === 'HTTPS') {
        const record = {
          link,
          type: 'http',
          info,
        };
        await ProxyInfo.updateOne(link, record);
      }
    }
  } catch (err) {
    console.log(`#######################error parse page:`, url, err);
  }
};

/**
 * Fetch proxy data from www.nimadaili.com.
 * Example page: http://www.nimadaili.com/https/1/
 *
 * @param {number} idx - Zero-based page index (page idx + 1 is requested).
 * @returns {Promise<void>}
 */
const parseNimadaili = async (idx) => {
  await parseFlTablePage(`http://www.nimadaili.com/https/${idx + 1}/`);
};

/**
 * Check one stored proxy and persist the outcome on the record.
 *
 * On success sets status = 1 and response_time; on failure sets status = -1
 * and increments err_count. try_count and last_check are updated either way.
 * A failure to save is logged and does not change the return value.
 *
 * @param {object} record - Proxy record exposing link, the counters above and save().
 * @returns {Promise<boolean>} true when the proxy responded.
 */
const checkAndUpdate = async (record) => {
  console.log('begin check:', record.link);
  try {
    const time = await proxyUtil.checkProxy(`http://${record.link}`);
    record.status = 1;
    record.response_time = time;
  } catch (err) {
    console.log('not available:', record.link);
    record.status = -1;
    record.err_count += 1;
  }
  record.try_count += 1;
  record.last_check = new Date();
  try {
    await record.save();
  } catch (err) {
    console.log('error update proxy record', record.link, err);
  }
  return record.status > 0;
};

/**
 * Scrape `pageCount` xiladaili pages starting at `beginPage`.
 *
 * @param {number} pageCount - Number of pages to scrape.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>}
 */
const parseXiladaili = async (pageCount, beginPage = 0) => {
  await parsePageRange(parseOneXiladaili, beginPage, beginPage + pageCount, 'Xiladaili');
};

/**
 * Scrape xicidaili pages 0 .. maxPage - 1.
 *
 * @param {number} maxPage - Number of pages to scrape.
 * @returns {Promise<void>}
 */
const parseXicidaili = async (maxPage) => {
  await parsePageRange(parseOneXicidaili, 0, maxPage, 'Xicidaili');
};

/**
 * Scrape `pageCount` nimadaili pages starting at `beginPage`.
 *
 * @param {number} pageCount - Number of pages to scrape.
 * @param {number} [beginPage=0] - Zero-based index of the first page.
 * @returns {Promise<void>}
 */
const parseNimadailidaili = async (pageCount, beginPage = 0) => {
  await parsePageRange(parseNimadaili, beginPage, beginPage + pageCount, 'nimadailidaili');
};

/**
 * Queue a check for every record returned by ProxyInfo.needCheckList(all) and
 * wait until generalQueue invokes the completion callback.
 *
 * @param {*} all - Passed straight through to ProxyInfo.needCheckList.
 * @returns {Promise<void>} Resolves when the queue callback fires; rejects if
 *   the record list cannot be loaded.
 */
const checkAllProxy = async (all) => {
  console.time('all');
  // Register the completion callback before loading, as the original did.
  // NOTE(review): assumes generalQueue calls this callback once the queue has
  // drained. Whether it also fires for an empty record list is not visible
  // from this file; if it does not, this promise never settles -- confirm.
  const finished = new Promise((resolve) => {
    generalQueue.setCb(function () {
      console.log('checkAllProxy finished');
      resolve();
    });
  });
  // A load failure rejects this function directly. Awaiting here (rather than
  // inside a Promise executor) avoids iterating an undefined list afterwards.
  const records = await ProxyInfo.needCheckList(all);
  for (const record of records) {
    try {
      generalQueue.addQueue({
        run: async function () {
          await checkAndUpdate(record);
        },
      });
    } catch (err) {
      console.log('error enqueue proxy check:', record.link, err);
    }
  }
  return finished;
};

export default {
  /**
   * Scrape the enabled sources, then re-check all stored proxies.
   * Errors are logged and never rethrown.
   *
   * @returns {Promise<void>}
   */
  run: async () => {
    try {
      await parseXiladaili(50);
      // xicidaili scraping is currently disabled.
      // await parseXicidaili(20);
      await parseNimadailidaili(50);
      await checkAllProxy(true);
    } catch (err) {
      console.log(err);
    }
  },
};