增加nimadaili.com代理数据的抓取

This commit is contained in:
zhl 2019-05-09 16:32:33 +08:00
parent 526b96ed78
commit e1dcbabd1f

View File

@ -12,7 +12,7 @@ const parseOneXiladaili = async (idx) => {
let url = `http://www.xiladaili.com/https/${idx + 1}/`; let url = `http://www.xiladaili.com/https/${idx + 1}/`;
console.log('begin parse page:', url); console.log('begin parse page:', url);
try { try {
let html = await netUtil.getData(url, {}) let html = await proxyUtil.getDataProxy(url)
if (html) { if (html) {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const trArr = $('.fl-table tbody tr'); const trArr = $('.fl-table tbody tr');
@ -27,7 +27,7 @@ const parseOneXiladaili = async (idx) => {
}); });
} }
} catch (err) { } catch (err) {
console.log(err); console.log(`#######################error parse page:`, url);
} }
} }
/* * /* *
@ -35,10 +35,10 @@ const parseOneXiladaili = async (idx) => {
* https://www.xicidaili.com/nn/1 * https://www.xicidaili.com/nn/1
* */ * */
const parseOneXicidaili = async (idx) => { const parseOneXicidaili = async (idx) => {
let url = `https://www.xicidaili.com/nn/${idx + 1}`; let url = `https://www.xicidaili.com/wn/${idx + 1}`;
console.log('begin parse page:', url); console.log('begin parse page:', url);
try { try {
let html = await proxyUtil.getData(url) let html = await proxyUtil.getDataProxy(url)
if (html) { if (html) {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const trs = $("#ip_list tr"); const trs = $("#ip_list tr");
@ -62,10 +62,34 @@ const parseOneXicidaili = async (idx) => {
} }
} }
} catch (err) { } catch (err) {
console.log(err); console.log(`#######################error parse page:`, url);
} }
} }
/**
 * Scrape one listing page of nimadaili.com and hand every proxy row found
 * to `ProxyInfo.updateOne`.
 * Example page: http://www.nimadaili.com/https/1/
 *
 * @param {number} idx - Zero-based page index; page `idx + 1` is requested.
 * @returns {Promise<void>} Resolves after all rows of the page were processed.
 *   Never rejects: any failure (download, parse, update) is logged and the
 *   rest of the page is abandoned.
 */
const parseNimadaili = async (idx) => {
    const url = `http://www.nimadaili.com/https/${idx + 1}/`;
    console.log('begin parse page:', url);
    try {
        const html = await proxyUtil.getDataProxy(url);
        if (!html) {
            return;
        }
        const $ = cheerio.load(html);
        const rows = $('.fl-table tbody tr').toArray();
        // A plain loop is used instead of `.each(async ...)`: an async callback
        // passed to `.each` is never awaited, so the updates would still be
        // running after this function returned and their rejections would
        // bypass the try/catch below.
        for (const tr of rows) {
            const cells = $(tr).find('td');
            const link = cells.first().text();
            if (!link) {
                // Row without an address cell: nothing usable to store.
                continue;
            }
            const record = {
                link,
                // NOTE(review): rows come from the /https/ listing but are
                // stored with type 'http' — confirm this is intended.
                type: 'http',
                info: cells.eq(3).text(),
            };
            await ProxyInfo.updateOne(link, record);
        }
    } catch (err) {
        console.log(`#######################error parse page:`, url, err && err.message);
    }
}
// 检查代理状态
const checkAndUpdate = async (record) => { const checkAndUpdate = async (record) => {
console.log('begin check:', record.link); console.log('begin check:', record.link);
try { try {
@ -75,6 +99,7 @@ const checkAndUpdate = async (record) => {
} catch (err) { } catch (err) {
console.log('not available:', record.link); console.log('not available:', record.link);
record.status = -1; record.status = -1;
record.err_count += 1;
} }
record.try_count += 1; record.try_count += 1;
record.last_check = new Date(); record.last_check = new Date();
@ -86,8 +111,9 @@ const checkAndUpdate = async (record) => {
return record.status > 0; return record.status > 0;
} }
const parseXiladaili = async (maxPage) => { const parseXiladaili = async (pageCount, beginPage = 0) => {
for (let i = 0; i < maxPage; i++) { const maxPage = beginPage + pageCount;
for (let i = beginPage; i < maxPage; i++) {
try { try {
await parseOneXiladaili(i); await parseOneXiladaili(i);
} catch (err) { } catch (err) {
@ -106,7 +132,18 @@ const parseXicidaili = async (maxPage) => {
} }
console.log('finish parse all page, Xicidaili'); console.log('finish parse all page, Xicidaili');
} }
// parseNimadailidaili(pageCount, beginPage = 0): crawl `pageCount` consecutive
// nimadaili listing pages, starting at zero-based page index `beginPage`.
// Pages are processed one at a time because each parseNimadaili call is awaited.
const checkAllProxy = async () => { const parseNimadailidaili = async (pageCount, beginPage = 0) => {
// Exclusive upper bound of the page-index range to visit.
const maxPage = beginPage + pageCount;
for (let i = beginPage; i < maxPage; i++) {
try {
await parseNimadaili(i);
} catch (err) {
// A failure on one page is logged and the loop moves on to the next page.
console.log(err);
}
}
console.log('finish parse all page, nimadailidaili');
}
const checkAllProxy = async (all) => {
let records; let records;
return new Promise(async (resolve, reject) => { return new Promise(async (resolve, reject) => {
console.time('all') console.time('all')
@ -115,7 +152,7 @@ const checkAllProxy = async () => {
resolve() resolve()
}) })
try { try {
records = await ProxyInfo.needCheckList(); records = await ProxyInfo.needCheckList(all);
} catch (err) { } catch (err) {
reject(err); reject(err);
} }
@ -134,9 +171,10 @@ const checkAllProxy = async () => {
export default { export default {
run: async () => { run: async () => {
try { try {
// await parseXiladaili(20); // await parseXiladaili(50, 50);
// await parseXicidaili(20); // await parseXicidaili(20);
await checkAllProxy(); await parseNimadailidaili(50);
await checkAllProxy(true);
} catch (err) { } catch (err) {
console.log(err); console.log(err);
} }