dandanzan的电影直接更新到movie表

This commit is contained in:
zhl 2019-05-07 17:06:28 +08:00
parent 8fcd8d1ae0
commit 20db2bc113
7 changed files with 150 additions and 91 deletions

View File

@ -21,6 +21,7 @@
"cheerio": "^1.0.0-rc.3",
"dateformat": "^3.0.3",
"file-stream-rotator": "^0.4.1",
"free-proxy": "^0.1.5",
"fs": "0.0.1-security",
"fs-extra": "^7.0.1",
"glob": "^7.1.3",
@ -28,8 +29,11 @@
"mkdirp": "^0.5.1",
"mongoose": "^5.2.15",
"multi-progress": "^2.0.0",
"proxy-lists": "^1.16.0",
"random-useragent": "^0.3.1",
"request": "^2.88.0",
"request-promise": "^4.2.4",
"socks5-http-client": "^1.0.4",
"ws": "^6.1.2"
},
"devDependencies": {

View File

@ -3,7 +3,7 @@ import mongoose from 'mongoose';
const Schema = mongoose.Schema;
const Movies = new Schema({
gid: {type: Number, required: true},
gid: {type: Number, required: true, index: true},
// 电影名
name: {type: String},
nameAlias: {type: String},
@ -35,9 +35,15 @@ const Movies = new Schema({
// 显示的播放数量
showCount: {type: Number, default: 1000},
// 排序字段, 倒序
sortIdx: {type: Number},
sortIdx: {type: Number, default: 0},
// jc排序字段 倒序,查询电影时候优先此字段倒序
sortJc: {type: Number, default: 0},
// 是否公开
open: {type: Boolean, default: false},
// 购买价格
price: {type: Number},
price: {type: Number, default: 10},
// 视频分类movie: 电影tv: 电视剧; show: 综艺节目
category: {type: String},
// 是否已删除
deleted: {type: Boolean, default: false},
delete_time: {type: Date},
@ -53,4 +59,9 @@ MovieModel.findByGid = function(gid) {
return MovieModel.findOne({gid: gid, deleted: false});
};
/**
 * Upsert a single movie document, matched by its gid.
 *
 * The filter contains only `gid`. The update call is issued with the
 * `upsert` and `setDefaultsOnInsert` options switched on, and its result
 * is discarded.
 *
 * NOTE(review): if MovieModel is a mongoose model, this assignment presumably
 * shadows the built-in static of the same name — confirm that is intended.
 *
 * @param {Number} gid - value matched against the `gid` field
 * @param {Object} record - update payload handed straight to `MovieModel.update`
 * @returns {Promise<void>} settles once the update call has completed
 */
MovieModel.updateOne = async function (gid, record) {
  const upsertOptions = { upsert: true, setDefaultsOnInsert: true };
  await MovieModel.update({ gid }, record, upsertOptions);
};
export default MovieModel;

View File

@ -1,57 +1,67 @@
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import SpiderData from "../models/SpiderData";
import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';
const parseOnePage = async (subLink) => {
// const url = `https://www.dandanzan.com/dianying/${idx}.html`
const url = `https://www.dandanzan.com${subLink}`
const URL_BASE = 'https://www.dandanzan.com'
/**
* 处理一个页面
* @param {String} subLink
* @param {String} category, 种类 movie: 电影 tv: 电视剧 show: 综艺
* */
const parseOnePage = async (subLink, category) => {
const url = `${URL_BASE}${subLink}`
try {
let html = await netUtil.getData(url, {})
const $ = cheerio.load(html);
if ($('.error404').text()) {
console.log(`>>>>>>>>>>>> ${url} not found`);
} else {
let link = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', ''));
let obj = {
id: idx,
daoYan: $('meta[property="og:video:director"]').attr('content'),
zhuYan: $('meta[property="og:video:actor"]').attr('content'),
grade: $('meta[property="og:video:score"]').attr('content'),
let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', '')
.replace('/dongman/', '')
.replace('/dianshiju/', '')
.replace('/zongyi/', ''));
const type = $('meta[property="og:video:class"]').attr('content');
let typeArr = type ? type.split(',') : [];
let arr = resourceStr.replace(/\|/g, '#').split('#');
let resourceArr = [];
for (let str of arr) {
resourceArr.push({
title: str.substring(0, str.indexOf('$')),
link: str.substring(str.indexOf('$') + 1)
})
}
let daoYan = $('meta[property="og:video:director"]').attr('content');
let zhuYan = $('meta[property="og:video:actor"]').attr('content').replace(/ \/ /g, ',');
let region = $('meta[property="og:video:area"]').attr('content');
let record = {
gid: idx,
name: $('meta[property="og:title"]').attr('content'),
type: typeArr,
resources: resourceArr,
daoYan: daoYan ? daoYan.split(',') : [],
zhuYan: zhuYan ? zhuYan.split(',') : [],
score: Number($('meta[property="og:video:score"]').attr('content')),
img: $('meta[property="og:image"]').attr('content'),
introduce: $('meta[property="og:description"]').attr('content'),
language: '',
name: $('meta[property="og:title"]').attr('content'),
name_alias: $('meta[property="og:video:alias"]').attr('content'),
playResourceUrl: link,
region: $('meta[property="og:video:area"]').attr('content'),
releaseDate: $('meta[property="og:video:release_date"]').attr('content'),
type: $('meta[property="og:video:class"]').attr('content')
nameAlias: $('meta[property="og:video:alias"]').attr('content'),
region: region ? region.split(',') : [],
year: Number($('meta[property="og:video:release_date"]').attr('content')),
category: category,
}
let sdata = new SpiderData({
type: 'movie',
data: obj,
status: 0
});
await sdata.save();
console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
await Movie.updateOne(idx, record);
console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
}
} catch (err) {
console.log(err);
}
}
const parseListPage = async (idx) => {
let url;
if (idx === 1) {
url = `https://www.dandanzan.com/dianying/index.html`;
} else {
url = `https://www.dandanzan.com/dianying/index_${idx}.html`;
}
console.log(`begin parse page: ${idx}`);
const parseListPage = async (subPage, category) => {
const url = `${URL_BASE}${subPage}`
console.log(`begin parse category: ${category} page: ${subPage}`);
let html;
try {
html = await netUtil.getData(url, {})
@ -69,71 +79,79 @@ const parseListPage = async (idx) => {
try {
generalQueue.addQueue({
run: async function () {
await parseOnePage(page);
await parseOnePage(page, category);
}
})
} catch (err) {
console.log(err);
}
}
}
console.log(`end parse page: ${idx}`);
}
const parseAllMovie = async () => {
const maxPage = 939;
// const maxPage = 10;
for (let i = 1; i < maxPage; i++) {
try {
await parseListPage(i);
} catch (err) {
console.log(err);
}
}
}
// 将下载的电影导入到正式表中
const parseAllMovieToDb = async () => {
try {
let list = await SpiderData.find({type: 'movie'});
for (let obj of list) {
let data = obj.data;
let typeArr = data.type.split(',');
let resourceStr = data.playResourceUrl;
let arr = resourceStr.replace(/\|/g, '#').split('#');
let resourceArr = [];
for (let str of arr) {
resourceArr.push({
title: str.substring(0, str.indexOf('$')),
link: str.substring(str.indexOf('$') + 1)
})
if ($('.next-page')) {
let nextStr = $('.next-page a').attr('href');
console.log('has next page: ', nextStr);
try {
await parseListPage(nextStr, category);
} catch (err) {
console.log(err);
}
let record = new Movie({
gid: data.id,
name: data.name,
nameAlias: data.name_alias,
type: typeArr,
score: data.grade,
introduce: data.introduce,
language: data.language,
img: data.img,
daoYan: data.daoYan.split(','),
zhuYan: data.zhuYan.split(','),
region: data.region.split(','),
year: data.releaseDate,
sortIdx: 0,
price: 10,
resources: resourceArr,
published: true,
});
await record.save();
} else {
console.log('########################### ALL LIST PAGE END ###########################');
}
} catch (err) {
console.log(err);
}
console.log(`end parse category: ${category} page: ${subPage}`);
}
/**
 * Start crawling one category from its first list page.
 *
 * The category is translated into the site's URL section name; any value
 * that is not one of the four known categories falls back to 'dianying'.
 * The 'all' console timer is started before the crawl begins.
 *
 * @param {String} category - 'movie' | 'tv' | 'show' | 'cartoon'
 * @returns {Promise<void>} settles when `parseListPage` for the index page settles
 */
const parseAllMovie = async (category) => {
  console.time('all');
  // category -> URL section of the target site
  const sectionByCategory = new Map([
    ['movie', 'dianying'],
    ['tv', 'dianshiju'],
    ['show', 'zongyi'],
    ['cartoon', 'dongman'],
  ]);
  const section = sectionByCategory.has(category)
    ? sectionByCategory.get(category)
    : 'dianying';
  await parseListPage(`/${section}/index.html`, category);
};
export default {
run: async () => {
// await parseAllMovie();
await parseAllMovieToDb();
console.log('all done');
// await parseAllMovie('movie');
// await parseAllMovie('tv');
// await parseAllMovie('show');
// await parseAllMovie('cartoon');
// console.log('all done');
let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
console.log(html);
// var ProxyLists = require('proxy-lists');
//
// var options = {
// countries: ['cn'],
// protocols: ['https'],
// };
//
// var gettingProxies = ProxyLists.getProxies(options);
// gettingProxies.on('data', function(proxies) {
// // Received some proxies.
// console.log(proxies);
// });
//
// gettingProxies.on('error', function(error) {
// // Some error has occurred.
// // console.error(error);
// });
//
// gettingProxies.once('end', function() {
// // Done getting proxies.
// console.log('finish get proxy');
// });
}
}

View File

@ -13,6 +13,7 @@ let q = async.queue( async (reqObj, cb) => {
}, 10);
// Drain hook for the queue: report completion, then stop the 'all' console timer.
// NOTE(review): presumably pairs with a console.time('all') started by the caller — confirm.
q.drain = () => {
  console.info('all queue done');
  console.timeEnd('all');
};
module.exports = {
addQueue(obj) {

View File

@ -1,5 +1,8 @@
import request from 'request';
import Promise from 'bluebird';
import proxy from './proxys';
import random_useragent from 'random-useragent';
let agent = require('socks5-http-client/lib/Agent')
const iconv = require('iconv-lite');
@ -26,6 +29,7 @@ export default {
'Cache-Control': 'no-cache',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
}
header['User-Agent'] = random_useragent.getRandom();
const options = {
method: 'POST',
url: url,
@ -39,10 +43,17 @@ export default {
header = header || {
'Cache-Control': 'no-cache',
}
header['User-Agent'] = random_useragent.getRandom();
const options = {
method: 'GET',
url: url,
headers: header,
proxy: proxy.randomProxy(),
// agentClass: agent,
// agentOptions: {
// socksHost: '101.71.41.169',
// socksPort: 43,
// }
};
if (encoding) {
options.encoding = null;

12
src/utils/proxys.js Normal file
View File

@ -0,0 +1,12 @@
import stringUtil from './string.util';
// Fixed pool of proxy URLs that randomProxy() chooses from.
const PROXY_URLS = [
  'http://101.71.41.169:443',
  'http://116.196.81.58:3128',
  'http://113.200.56.13:8010',
];

export default {
  /**
   * Pick one entry from the proxy pool.
   *
   * The index comes from `stringUtil.randomNum(0, lastIndex)`.
   *
   * @returns {String} one of the proxy URLs listed above
   */
  randomProxy() {
    const lastIndex = PROXY_URLS.length - 1;
    const chosen = stringUtil.randomNum(0, lastIndex);
    return PROXY_URLS[chosen];
  },
};

View File

@ -17,7 +17,9 @@ export default {
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
}
},
randomNum(minNum, maxNum) {
return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10);
},
getContentByReg(str, reg) {
const contents = str.match(reg);
if (contents && contents.length > 1) {