diff --git a/package.json b/package.json index e5b2820..f56f617 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "cheerio": "^1.0.0-rc.3", "dateformat": "^3.0.3", "file-stream-rotator": "^0.4.1", + "free-proxy": "^0.1.5", "fs": "0.0.1-security", "fs-extra": "^7.0.1", "glob": "^7.1.3", @@ -28,8 +29,11 @@ "mkdirp": "^0.5.1", "mongoose": "^5.2.15", "multi-progress": "^2.0.0", + "proxy-lists": "^1.16.0", + "random-useragent": "^0.3.1", "request": "^2.88.0", "request-promise": "^4.2.4", + "socks5-http-client": "^1.0.4", "ws": "^6.1.2" }, "devDependencies": { diff --git a/src/models/Movies.js b/src/models/Movies.js index 551236c..88e6bdb 100644 --- a/src/models/Movies.js +++ b/src/models/Movies.js @@ -3,7 +3,7 @@ import mongoose from 'mongoose'; const Schema = mongoose.Schema; const Movies = new Schema({ - gid: {type: Number, required: true}, + gid: {type: Number, required: true, index: true}, // 电影名 name: {type: String}, nameAlias: {type: String}, @@ -35,9 +35,15 @@ const Movies = new Schema({ // 显示的播放数量 showCount: {type: Number, default: 1000}, // 排序字段, 倒序 - sortIdx: {type: Number}, + sortIdx: {type: Number, default: 0}, + // jc排序字段, 倒序,查询电影时候优先此字段倒序 + sortJc: {type: Number, default: 0}, + // 是否公开 + open: {type: Boolean, default: false}, // 购买价格 - price: {type: Number}, + price: {type: Number, default: 10}, + // 视频分类,movie: 电影;tv: 电视剧; show: 综艺节目 + category: {type: String}, // 是否已删除 deleted: {type: Boolean, default: false}, delete_time: {type: Date}, @@ -53,4 +59,9 @@ MovieModel.findByGid = function(gid) { return MovieModel.findOne({gid: gid, deleted: false}); }; +MovieModel.updateOne = async function (gid, record) { + const query = {gid: gid}; + const options = {upsert: true, setDefaultsOnInsert:true}; + await MovieModel.update(query, record, options); +} export default MovieModel; diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js index e5ca4be..d7a2875 100644 --- a/src/sites/dandanzan.js +++ b/src/sites/dandanzan.js @@ -1,57 +1,67 @@ import netUtil from "../utils/net.util"; import cheerio from "cheerio"; import stringUtil from '../utils/string.util'; -import SpiderData from "../models/SpiderData"; import Movie from '../models/Movies'; import generalQueue from '../utils/general.queue'; -const parseOnePage = async (subLink) => { - // const url = `https://www.dandanzan.com/dianying/${idx}.html` - const url = `https://www.dandanzan.com${subLink}` +const URL_BASE = 'https://www.dandanzan.com' +/** + * 处理一个页面 + * @param {String} subLink + * @param {String} category, 种类, movie: 电影, tv: 电视剧, show: 综艺 + * */ +const parseOnePage = async (subLink, category) => { + const url = `${URL_BASE}${subLink}` try { let html = await netUtil.getData(url, {}) const $ = cheerio.load(html); if ($('.error404').text()) { console.log(`>>>>>>>>>>>> ${url} not found`); } else { - let link = stringUtil.getContentByReg(html,/links='(.+?)\|'/); - let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', '')); - let obj = { - id: idx, - daoYan: $('meta[property="og:video:director"]').attr('content'), - zhuYan: $('meta[property="og:video:actor"]').attr('content'), - grade: $('meta[property="og:video:score"]').attr('content'), + let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/); + let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', '') + .replace('/dongman/', '') + .replace('/dianshiju/', '') + .replace('/zongyi/', '')); + const type = $('meta[property="og:video:class"]').attr('content'); + let typeArr = type ? type.split(',') : []; + let arr = resourceStr.replace(/\|/g, '#').split('#'); + let resourceArr = []; + for (let str of arr) { + resourceArr.push({ + title: str.substring(0, str.indexOf('$')), + link: str.substring(str.indexOf('$') + 1) + }) + } + let daoYan = $('meta[property="og:video:director"]').attr('content'); + let zhuYan = $('meta[property="og:video:actor"]').attr('content').replace(/ \/ /g, ','); + let region = $('meta[property="og:video:area"]').attr('content'); + let record = { + gid: idx, + name: $('meta[property="og:title"]').attr('content'), + type: typeArr, + resources: resourceArr, + daoYan: daoYan ? daoYan.split(',') : [], + zhuYan: zhuYan ? zhuYan.split(',') : [], + score: Number($('meta[property="og:video:score"]').attr('content')), img: $('meta[property="og:image"]').attr('content'), introduce: $('meta[property="og:description"]').attr('content'), - language: '', - name: $('meta[property="og:title"]').attr('content'), - name_alias: $('meta[property="og:video:alias"]').attr('content'), - playResourceUrl: link, - region: $('meta[property="og:video:area"]').attr('content'), - releaseDate: $('meta[property="og:video:release_date"]').attr('content'), - type: $('meta[property="og:video:class"]').attr('content') + nameAlias: $('meta[property="og:video:alias"]').attr('content'), + region: region ? region.split(',') : [], + year: Number($('meta[property="og:video:release_date"]').attr('content')), + category: category, } - let sdata = new SpiderData({ - type: 'movie', - data: obj, - status: 0 - }); - await sdata.save(); - console.log(`@@@@@ ${subLink} @ ${obj.name} saved`); + await Movie.updateOne(idx, record); + console.log(`@@@@@ ${subLink} @ ${record.name} saved`); } } catch (err) { console.log(err); } } -const parseListPage = async (idx) => { - let url; - if (idx === 1) { - url = `https://www.dandanzan.com/dianying/index.html`; - } else { - url = `https://www.dandanzan.com/dianying/index_${idx}.html`; - } - console.log(`begin parse page: ${idx}`); +const parseListPage = async (subPage, category) => { + const url = `${URL_BASE}${subPage}` + console.log(`begin parse category: ${category} page: ${subPage}`); let html; try { html = await netUtil.getData(url, {}) @@ -69,71 +79,79 @@ const parseListPage = async (idx) => { try { generalQueue.addQueue({ run: async function () { - await parseOnePage(page); + await parseOnePage(page, category); } }) } catch (err) { console.log(err); } } - } - console.log(`end parse page: ${idx}`); -} -const parseAllMovie = async () => { - const maxPage = 939; - // const maxPage = 10; - for (let i = 1; i < maxPage; i++) { - try { - await parseListPage(i); - } catch (err) { - console.log(err); - } - } -} -// 将下载的电影导入到正式表中 -const parseAllMovieToDb = async () => { - try { - let list = await SpiderData.find({type: 'movie'}); - for (let obj of list) { - let data = obj.data; - let typeArr = data.type.split(','); - let resourceStr = data.playResourceUrl; - let arr = resourceStr.replace(/\|/g, '#').split('#'); - let resourceArr = []; - for (let str of arr) { - resourceArr.push({ - title: str.substring(0, str.indexOf('$')), - link: str.substring(str.indexOf('$') + 1) - }) + if ($('.next-page')) { + let nextStr = $('.next-page a').attr('href'); + console.log('has next page: ', nextStr); + try { + await parseListPage(nextStr, category); + } catch (err) { + console.log(err); } - let record = new Movie({ - gid: data.id, - name: data.name, - nameAlias: data.name_alias, - type: typeArr, - score: data.grade, - introduce: data.introduce, - language: data.language, - img: data.img, - daoYan: data.daoYan.split(','), - zhuYan: data.zhuYan.split(','), - region: data.region.split(','), - year: data.releaseDate, - sortIdx: 0, - price: 10, - resources: resourceArr, - published: true, - }); - await record.save(); + } else { + console.log('########################### ALL LIST PAGE END ###########################'); } - } catch (err) { - console.log(err); } + + console.log(`end parse category: ${category} page: ${subPage}`); } +const parseAllMovie = async (category) => { + console.time('all'); + let subName = 'dianying'; + switch (category) { + case 'movie': + subName = 'dianying'; + break; + case 'tv': + subName = 'dianshiju' + break; + case 'show': + subName = 'zongyi'; + break; + case 'cartoon': + subName = 'dongman'; + break; + } + const subPage = `/${subName}/index.html` + await parseListPage(subPage, category); +} + export default { run: async () => { - // await parseAllMovie(); - await parseAllMovieToDb(); - console.log('all done'); + // await parseAllMovie('movie'); + // await parseAllMovie('tv'); + // await parseAllMovie('show'); + // await parseAllMovie('cartoon'); + // console.log('all done'); + let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {}) + console.log(html); + // var ProxyLists = require('proxy-lists'); + // + // var options = { + // countries: ['cn'], + // protocols: ['https'], + // }; + // + // var gettingProxies = ProxyLists.getProxies(options); + // gettingProxies.on('data', function(proxies) { + // // Received some proxies. + // console.log(proxies); + // }); + // + // gettingProxies.on('error', function(error) { + // // Some error has occurred. + // // console.error(error); + // }); + // + // gettingProxies.once('end', function() { + // // Done getting proxies. + // console.log('finish get proxy'); + // }); } } diff --git a/src/utils/general.queue.js b/src/utils/general.queue.js index 25d7c21..2102665 100644 --- a/src/utils/general.queue.js +++ b/src/utils/general.queue.js @@ -13,6 +13,7 @@ let q = async.queue( async (reqObj, cb) => { }, 10); q.drain = function(){ console.info('all queue done'); + console.timeEnd('all'); }; module.exports = { addQueue(obj) { diff --git a/src/utils/net.util.js b/src/utils/net.util.js index 70d269f..342636c 100644 --- a/src/utils/net.util.js +++ b/src/utils/net.util.js @@ -1,5 +1,8 @@ import request from 'request'; import Promise from 'bluebird'; +import proxy from './proxys'; +import random_useragent from 'random-useragent'; +let agent = require('socks5-http-client/lib/Agent') const iconv = require('iconv-lite'); @@ -26,6 +29,7 @@ export default { 'Cache-Control': 'no-cache', 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', } + header['User-Agent'] = random_useragent.getRandom(); const options = { method: 'POST', url: url, @@ -39,10 +43,17 @@ export default { header = header || { 'Cache-Control': 'no-cache', } + header['User-Agent'] = random_useragent.getRandom(); const options = { method: 'GET', url: url, headers: header, + proxy: proxy.randomProxy(), + // agentClass: agent, + // agentOptions: { + // socksHost: '101.71.41.169', + // socksPort: 43, + // } }; if (encoding) { options.encoding = null; diff --git a/src/utils/proxys.js b/src/utils/proxys.js new file mode 100644 index 0000000..4f93f0e --- /dev/null +++ b/src/utils/proxys.js @@ -0,0 +1,12 @@ +import stringUtil from './string.util'; +const proxys = [ + 'http://101.71.41.169:443', + 'http://116.196.81.58:3128', + 'http://113.200.56.13:8010' + ]; +export default { + + randomProxy() { + return proxys[stringUtil.randomNum(0, proxys.length - 1)]; + } +} diff --git a/src/utils/string.util.js b/src/utils/string.util.js index 78da2dd..a2a3a39 100644 --- a/src/utils/string.util.js +++ b/src/utils/string.util.js @@ -17,7 +17,9 @@ export default { return content.replace(/<.+?>/g, '').replace(/\s/g, ''); } }, - + randomNum(minNum, maxNum) { + return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10); + }, getContentByReg(str, reg) { const contents = str.match(reg); if (contents && contents.length > 1) {