From 20492021ea48268661a0d8374b6ba80699e13c86 Mon Sep 17 00:00:00 2001 From: zhl Date: Wed, 8 May 2019 13:49:18 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=94=B5=E5=BD=B1=E6=8A=93?= =?UTF-8?q?=E5=8F=96=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/models/Movies.js | 2 + src/sites/dandanzan.js | 131 ++++++++++++++++++++++----------------- src/utils/net.util.js | 18 +++++- src/utils/proxys.js | 8 ++- src/utils/string.util.js | 14 +++++ 5 files changed, 111 insertions(+), 62 deletions(-) diff --git a/src/models/Movies.js b/src/models/Movies.js index 88e6bdb..b1173fb 100644 --- a/src/models/Movies.js +++ b/src/models/Movies.js @@ -42,6 +42,8 @@ const Movies = new Schema({ open: {type: Boolean, default: false}, // 购买价格 price: {type: Number, default: 10}, + // 是否是新记录 + is_new: {type: Boolean, default: true}, // 视频分类,movie: 电影;tv: 电视剧; show: 综艺节目 category: {type: String}, // 是否已删除 diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js index 4485b91..f659aa2 100644 --- a/src/sites/dandanzan.js +++ b/src/sites/dandanzan.js @@ -5,25 +5,23 @@ import Movie from '../models/Movies'; import generalQueue from '../utils/general.queue'; const URL_BASE = 'https://www.dandanzan.com' +const maxIdx = 100000; /** * 处理一个页面 * @param {String} subLink * @param {String} category, 种类, movie: 电影, tv: 电视剧, show: 综艺 + * @param {Number} sortIdx * */ -const parseOnePage = async (subLink, category) => { +const parseOnePage = async (subLink, category, sortIdx) => { const url = `${URL_BASE}${subLink}` try { - let html = await netUtil.getData(url, {}) + let html = await netUtil.getDataProxy(url, {}) const $ = cheerio.load(html); if ($('.error404').text()) { console.log(`>>>>>>>>>>>> ${url} not found`); } else { let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/); - let idx = parseInt(subLink.replace('/dianying/', '') - .replace('.html', '') - .replace('/dongman/', '') - .replace('/dianshiju/', '') - .replace('/zongyi/', '')); + let idx = parseInt(stringUtil.findByReg(subLink, /\/.+?\/(\d+?)\.html/)); const type = $('meta[property="og:video:class"]').attr('content'); let typeArr = type ? type.split(',') : []; let arr = resourceStr.replace(/\|/g, '#').split('#'); @@ -39,7 +37,7 @@ const parseOnePage = async (subLink, category) => { let region = $('meta[property="og:video:area"]').attr('content'); let record = await Movie.findByGid(idx); if (!record) { - record = { + record = new Movie({ gid: idx, name: $('meta[property="og:title"]').attr('content'), type: typeArr, @@ -53,63 +51,24 @@ const parseOnePage = async (subLink, category) => { region: region ? region.split(',') : [], year: Number($('meta[property="og:video:release_date"]').attr('content')), category: category, - } + sortIdx: sortIdx, + is_new: true, + }) } else { record.resources = resourceArr; + record.sortIdx = sortIdx; } await record.save(); - console.log(`@@@@@ ${subLink} @ ${record.name} saved`); + console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`); } } catch (err) { console.log(err); } } -const parseListPage = async (subPage, category) => { - const url = `${URL_BASE}${subPage}` - console.log(`begin parse category: ${category} page: ${subPage}`); - let html; - try { - html = await netUtil.getData(url, {}) - } catch (err) { - console.log(err); - } - if (html) { - const $ = cheerio.load(html); - let hrefs = $('.thumbnail'); - let pages = []; - $(hrefs).each(function(i, link){ - pages.push($(this).attr('href')); - }); - for(let page of pages) { - try { - generalQueue.addQueue({ - run: async function () { - await parseOnePage(page, category); - } - }) - } catch (err) { - console.log(err); - } - } - if ($('.next-page')) { - let nextStr = $('.next-page a').attr('href'); - console.log('has next page: ', nextStr); - try { - await parseListPage(nextStr, category); - } catch (err) { - console.log(err); - } - } else { - console.log('########################### ALL LIST PAGE END ###########################'); - } - } - - console.log(`end parse category: ${category} page: ${subPage}`); -} -const parseAllMovie = async (category) => { - console.time('all'); +const parseListPage = async (idx, category) => { let subName = 'dianying'; + let index = 0; switch (category) { case 'movie': subName = 'dianying'; @@ -124,16 +83,72 @@ const parseAllMovie = async (category) => { subName = 'dongman'; break; } - const subPage = `/${subName}/index.html` - await parseListPage(subPage, category); + const subPage = !idx ? `/${subName}/----onclick.html` : `/${subName}/---${idx}-onclick.html` + const url = `${URL_BASE}${subPage}` + console.log(`begin parse category: ${category} page: ${subPage}`); + let html; + try { + html = await netUtil.getDataProxy(url, {}) + } catch (err) { + console.log(err); + } + if (html) { + const $ = cheerio.load(html); + let hrefs = $('.thumbnail'); + let pages = []; + $(hrefs).each(function(i, link){ + pages.push($(this).attr('href')); + }); + for(let page of pages) { + try { + let sortIdx = maxIdx - (idx * 24 + (index ++) ); + generalQueue.addQueue({ + run: async function () { + await parseOnePage(page, category, sortIdx); + } + }) + } catch (err) { + console.log(err); + } + } + if (!idx) { + const lastPage = $('.pagination ul li').last().find('a').attr('href'); + return parseInt(stringUtil.findByReg(lastPage, /\/.+?\/---(\d+?)-onclick\.html/)); + } else { + return 1; + } + // if ($('.next-page')) { + // let nextStr = $('.next-page a').attr('href'); + // console.log('has next page: ', nextStr); + // try { + // await parseListPage(nextStr, category); + // } catch (err) { + // console.log(err); + // } + // } else { + // console.log('########################### ALL LIST PAGE END ###########################'); + // } + } else { + return 1; + } +} +const parseAllMovie = async (category) => { + console.time('all'); + let allPageNo = await parseListPage(0, category); + console.log('app page is', allPageNo); + if (allPageNo > 1) { + for (let i = 1; i <= allPageNo; i++) { + await parseListPage(i, category); + } + } } export default { run: async () => { - await parseAllMovie('movie'); + // await parseAllMovie('movie'); // await parseAllMovie('tv'); // await parseAllMovie('show'); - // await parseAllMovie('cartoon'); + await parseAllMovie('cartoon'); // console.log('all done'); // let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {}) // console.log(html); diff --git a/src/utils/net.util.js b/src/utils/net.util.js index 342636c..6ba968c 100644 --- a/src/utils/net.util.js +++ b/src/utils/net.util.js @@ -14,7 +14,7 @@ const requestData = (options, encoding) => { return reject(err); } if (response.statusCode >= 300) { - return reject(new Error('server response code: ' + response.statusCode)); + return reject(new Error(' server response code: ' + response.statusCode + ' with url: ' + options.url)); } if (encoding) { body = iconv.decode(body, encoding); @@ -40,6 +40,22 @@ export default { return requestData(options); }, getData(url, header, encoding, gzip) { + header = header || { + 'Cache-Control': 'no-cache', + } + header['User-Agent'] = random_useragent.getRandom(); + const options = { + method: 'GET', + url: url, + headers: header, + }; + if (encoding) { + options.encoding = null; + } + (gzip) && (options.gzip = true); + return requestData(options, encoding); + }, + getDataProxy(url, header, encoding, gzip) { header = header || { 'Cache-Control': 'no-cache', } diff --git a/src/utils/proxys.js b/src/utils/proxys.js index 4f93f0e..0383fde 100644 --- a/src/utils/proxys.js +++ b/src/utils/proxys.js @@ -1,8 +1,10 @@ import stringUtil from './string.util'; const proxys = [ - 'http://101.71.41.169:443', - 'http://116.196.81.58:3128', - 'http://113.200.56.13:8010' + 'http://113.200.56.13:8010', + 'http://65.52.174.40:80', + 'http://165.22.254.199:8080', + 'http://88.255.101.241:8080', + 'http://117.197.117.50:8080' ]; export default { diff --git a/src/utils/string.util.js b/src/utils/string.util.js index 1129549..13f8d3f 100644 --- a/src/utils/string.util.js +++ b/src/utils/string.util.js @@ -21,6 +21,20 @@ export default { return content.replace(/<.+?>/g, '').replace(/\s/g, ''); } }, + /** + * 根据正则查找内容 + * @param {string} content + * @param {RegExp} re 正则表达式, 例如:/\/.+?\/(\d+?)\.html/ + * @return {string} 匹配到的内容, 没找到则返回'' + * */ + findByReg(content, re) { + const contents = content.match(re); + let result = ''; + if (contents) { + if (contents.length > 1) result = contents[1]; + } + return result; + }, randomNum(minNum, maxNum) { return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10); },