dandanzan的电影直接更新到movie表
This commit is contained in:
parent
8fcd8d1ae0
commit
20db2bc113
@ -21,6 +21,7 @@
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"dateformat": "^3.0.3",
|
||||
"file-stream-rotator": "^0.4.1",
|
||||
"free-proxy": "^0.1.5",
|
||||
"fs": "0.0.1-security",
|
||||
"fs-extra": "^7.0.1",
|
||||
"glob": "^7.1.3",
|
||||
@ -28,8 +29,11 @@
|
||||
"mkdirp": "^0.5.1",
|
||||
"mongoose": "^5.2.15",
|
||||
"multi-progress": "^2.0.0",
|
||||
"proxy-lists": "^1.16.0",
|
||||
"random-useragent": "^0.3.1",
|
||||
"request": "^2.88.0",
|
||||
"request-promise": "^4.2.4",
|
||||
"socks5-http-client": "^1.0.4",
|
||||
"ws": "^6.1.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
@ -3,7 +3,7 @@ import mongoose from 'mongoose';
|
||||
const Schema = mongoose.Schema;
|
||||
|
||||
const Movies = new Schema({
|
||||
gid: {type: Number, required: true},
|
||||
gid: {type: Number, required: true, index: true},
|
||||
// 电影名
|
||||
name: {type: String},
|
||||
nameAlias: {type: String},
|
||||
@ -35,9 +35,15 @@ const Movies = new Schema({
|
||||
// 显示的播放数量
|
||||
showCount: {type: Number, default: 1000},
|
||||
// 排序字段, 倒序
|
||||
sortIdx: {type: Number},
|
||||
sortIdx: {type: Number, default: 0},
|
||||
// jc排序字段, 倒序,查询电影时候优先此字段倒序
|
||||
sortJc: {type: Number, default: 0},
|
||||
// 是否公开
|
||||
open: {type: Boolean, default: false},
|
||||
// 购买价格
|
||||
price: {type: Number},
|
||||
price: {type: Number, default: 10},
|
||||
// 视频分类,movie: 电影;tv: 电视剧; show: 综艺节目
|
||||
category: {type: String},
|
||||
// 是否已删除
|
||||
deleted: {type: Boolean, default: false},
|
||||
delete_time: {type: Date},
|
||||
@ -53,4 +59,9 @@ MovieModel.findByGid = function(gid) {
|
||||
return MovieModel.findOne({gid: gid, deleted: false});
|
||||
};
|
||||
|
||||
/**
 * Upsert a movie document keyed by its gid.
 *
 * Delegates to MovieModel.update with `upsert` and `setDefaultsOnInsert`
 * switched on, so a missing document is inserted instead of the call being a no-op.
 *
 * NOTE(review): this assignment presumably replaces the model's built-in
 * `updateOne` static — confirm no caller relies on the stock signature.
 *
 * @param {Number} gid - value matched against the `gid` field
 * @param {Object} record - update document handed to MovieModel.update
 * @returns {Promise<void>} settles once the update call has finished
 */
MovieModel.updateOne = async function (gid, record) {
  await MovieModel.update(
    {gid},
    record,
    {upsert: true, setDefaultsOnInsert: true}
  );
};
|
||||
export default MovieModel;
|
||||
|
@ -1,57 +1,67 @@
|
||||
import netUtil from "../utils/net.util";
|
||||
import cheerio from "cheerio";
|
||||
import stringUtil from '../utils/string.util';
|
||||
import SpiderData from "../models/SpiderData";
|
||||
import Movie from '../models/Movies';
|
||||
import generalQueue from '../utils/general.queue';
|
||||
|
||||
const parseOnePage = async (subLink) => {
|
||||
// const url = `https://www.dandanzan.com/dianying/${idx}.html`
|
||||
const url = `https://www.dandanzan.com${subLink}`
|
||||
const URL_BASE = 'https://www.dandanzan.com'
|
||||
/**
|
||||
* 处理一个页面
|
||||
* @param {String} subLink
|
||||
* @param {String} category, 种类, movie: 电影, tv: 电视剧, show: 综艺
|
||||
* */
|
||||
const parseOnePage = async (subLink, category) => {
|
||||
const url = `${URL_BASE}${subLink}`
|
||||
try {
|
||||
let html = await netUtil.getData(url, {})
|
||||
const $ = cheerio.load(html);
|
||||
if ($('.error404').text()) {
|
||||
console.log(`>>>>>>>>>>>> ${url} not found`);
|
||||
} else {
|
||||
let link = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
|
||||
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', ''));
|
||||
let obj = {
|
||||
id: idx,
|
||||
daoYan: $('meta[property="og:video:director"]').attr('content'),
|
||||
zhuYan: $('meta[property="og:video:actor"]').attr('content'),
|
||||
grade: $('meta[property="og:video:score"]').attr('content'),
|
||||
let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
|
||||
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', '')
|
||||
.replace('/dongman/', '')
|
||||
.replace('/dianshiju/', '')
|
||||
.replace('/zongyi/', ''));
|
||||
const type = $('meta[property="og:video:class"]').attr('content');
|
||||
let typeArr = type ? type.split(',') : [];
|
||||
let arr = resourceStr.replace(/\|/g, '#').split('#');
|
||||
let resourceArr = [];
|
||||
for (let str of arr) {
|
||||
resourceArr.push({
|
||||
title: str.substring(0, str.indexOf('$')),
|
||||
link: str.substring(str.indexOf('$') + 1)
|
||||
})
|
||||
}
|
||||
let daoYan = $('meta[property="og:video:director"]').attr('content');
|
||||
let zhuYan = $('meta[property="og:video:actor"]').attr('content').replace(/ \/ /g, ',');
|
||||
let region = $('meta[property="og:video:area"]').attr('content');
|
||||
let record = {
|
||||
gid: idx,
|
||||
name: $('meta[property="og:title"]').attr('content'),
|
||||
type: typeArr,
|
||||
resources: resourceArr,
|
||||
daoYan: daoYan ? daoYan.split(',') : [],
|
||||
zhuYan: zhuYan ? zhuYan.split(',') : [],
|
||||
score: Number($('meta[property="og:video:score"]').attr('content')),
|
||||
img: $('meta[property="og:image"]').attr('content'),
|
||||
introduce: $('meta[property="og:description"]').attr('content'),
|
||||
language: '',
|
||||
name: $('meta[property="og:title"]').attr('content'),
|
||||
name_alias: $('meta[property="og:video:alias"]').attr('content'),
|
||||
playResourceUrl: link,
|
||||
region: $('meta[property="og:video:area"]').attr('content'),
|
||||
releaseDate: $('meta[property="og:video:release_date"]').attr('content'),
|
||||
type: $('meta[property="og:video:class"]').attr('content')
|
||||
nameAlias: $('meta[property="og:video:alias"]').attr('content'),
|
||||
region: region ? region.split(',') : [],
|
||||
year: Number($('meta[property="og:video:release_date"]').attr('content')),
|
||||
category: category,
|
||||
}
|
||||
let sdata = new SpiderData({
|
||||
type: 'movie',
|
||||
data: obj,
|
||||
status: 0
|
||||
});
|
||||
await sdata.save();
|
||||
console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
|
||||
await Movie.updateOne(idx, record);
|
||||
console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
|
||||
}
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
}
|
||||
|
||||
const parseListPage = async (idx) => {
|
||||
let url;
|
||||
if (idx === 1) {
|
||||
url = `https://www.dandanzan.com/dianying/index.html`;
|
||||
} else {
|
||||
url = `https://www.dandanzan.com/dianying/index_${idx}.html`;
|
||||
}
|
||||
console.log(`begin parse page: ${idx}`);
|
||||
const parseListPage = async (subPage, category) => {
|
||||
const url = `${URL_BASE}${subPage}`
|
||||
console.log(`begin parse category: ${category} page: ${subPage}`);
|
||||
let html;
|
||||
try {
|
||||
html = await netUtil.getData(url, {})
|
||||
@ -69,71 +79,79 @@ const parseListPage = async (idx) => {
|
||||
try {
|
||||
generalQueue.addQueue({
|
||||
run: async function () {
|
||||
await parseOnePage(page);
|
||||
await parseOnePage(page, category);
|
||||
}
|
||||
})
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
console.log(`end parse page: ${idx}`);
|
||||
}
|
||||
const parseAllMovie = async () => {
|
||||
const maxPage = 939;
|
||||
// const maxPage = 10;
|
||||
for (let i = 1; i < maxPage; i++) {
|
||||
try {
|
||||
await parseListPage(i);
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
// 将下载的电影导入到正式表中
|
||||
const parseAllMovieToDb = async () => {
|
||||
try {
|
||||
let list = await SpiderData.find({type: 'movie'});
|
||||
for (let obj of list) {
|
||||
let data = obj.data;
|
||||
let typeArr = data.type.split(',');
|
||||
let resourceStr = data.playResourceUrl;
|
||||
let arr = resourceStr.replace(/\|/g, '#').split('#');
|
||||
let resourceArr = [];
|
||||
for (let str of arr) {
|
||||
resourceArr.push({
|
||||
title: str.substring(0, str.indexOf('$')),
|
||||
link: str.substring(str.indexOf('$') + 1)
|
||||
})
|
||||
if ($('.next-page')) {
|
||||
let nextStr = $('.next-page a').attr('href');
|
||||
console.log('has next page: ', nextStr);
|
||||
try {
|
||||
await parseListPage(nextStr, category);
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
let record = new Movie({
|
||||
gid: data.id,
|
||||
name: data.name,
|
||||
nameAlias: data.name_alias,
|
||||
type: typeArr,
|
||||
score: data.grade,
|
||||
introduce: data.introduce,
|
||||
language: data.language,
|
||||
img: data.img,
|
||||
daoYan: data.daoYan.split(','),
|
||||
zhuYan: data.zhuYan.split(','),
|
||||
region: data.region.split(','),
|
||||
year: data.releaseDate,
|
||||
sortIdx: 0,
|
||||
price: 10,
|
||||
resources: resourceArr,
|
||||
published: true,
|
||||
});
|
||||
await record.save();
|
||||
} else {
|
||||
console.log('########################### ALL LIST PAGE END ###########################');
|
||||
}
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
|
||||
console.log(`end parse category: ${category} page: ${subPage}`);
|
||||
}
|
||||
/**
 * Crawl one category of the site, starting from its first listing page.
 *
 * Starts the 'all' console timer, maps the category to the site's URL path
 * segment and hands the category's index page to parseListPage.
 *
 * An omitted or unrecognised category is treated as 'movie' for BOTH the URL
 * and the label passed on, so pages taken from the movie listing are never
 * stored under a label that does not match them.
 *
 * @param {String} [category='movie'] - 'movie', 'tv', 'show' or 'cartoon'
 * @returns {Promise<void>} settles when parseListPage settles
 */
const parseAllMovie = async (category = 'movie') => {
  // category label -> path segment used by the site
  const subNames = new Map([
    ['movie', 'dianying'],
    ['tv', 'dianshiju'],
    ['show', 'zongyi'],
    ['cartoon', 'dongman'],
  ]);

  console.time('all');

  let label = category;
  if (!subNames.has(label)) {
    console.warn(`unknown category: ${category}, fallback to movie`);
    label = 'movie';
  }

  const subPage = `/${subNames.get(label)}/index.html`;
  await parseListPage(subPage, label);
};
|
||||
|
||||
export default {
|
||||
run: async () => {
|
||||
// await parseAllMovie();
|
||||
await parseAllMovieToDb();
|
||||
console.log('all done');
|
||||
// await parseAllMovie('movie');
|
||||
// await parseAllMovie('tv');
|
||||
// await parseAllMovie('show');
|
||||
// await parseAllMovie('cartoon');
|
||||
// console.log('all done');
|
||||
let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
|
||||
console.log(html);
|
||||
// var ProxyLists = require('proxy-lists');
|
||||
//
|
||||
// var options = {
|
||||
// countries: ['cn'],
|
||||
// protocols: ['https'],
|
||||
// };
|
||||
//
|
||||
// var gettingProxies = ProxyLists.getProxies(options);
|
||||
// gettingProxies.on('data', function(proxies) {
|
||||
// // Received some proxies.
|
||||
// console.log(proxies);
|
||||
// });
|
||||
//
|
||||
// gettingProxies.on('error', function(error) {
|
||||
// // Some error has occurred.
|
||||
// // console.error(error);
|
||||
// });
|
||||
//
|
||||
// gettingProxies.once('end', function() {
|
||||
// // Done getting proxies.
|
||||
// console.log('finish get proxy');
|
||||
// });
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ let q = async.queue( async (reqObj, cb) => {
|
||||
}, 10);
|
||||
// Drain hook of the queue (presumably invoked by `async` once every queued
// task has finished — confirm against the installed version): logs completion
// and stops the 'all' console timer.
q.drain = () => {
  console.info('all queue done');
  console.timeEnd('all');
};
|
||||
module.exports = {
|
||||
addQueue(obj) {
|
||||
|
@ -1,5 +1,8 @@
|
||||
import request from 'request';
|
||||
import Promise from 'bluebird';
|
||||
import proxy from './proxys';
|
||||
import random_useragent from 'random-useragent';
|
||||
let agent = require('socks5-http-client/lib/Agent')
|
||||
|
||||
|
||||
const iconv = require('iconv-lite');
|
||||
@ -26,6 +29,7 @@ export default {
|
||||
'Cache-Control': 'no-cache',
|
||||
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
|
||||
}
|
||||
header['User-Agent'] = random_useragent.getRandom();
|
||||
const options = {
|
||||
method: 'POST',
|
||||
url: url,
|
||||
@ -39,10 +43,17 @@ export default {
|
||||
header = header || {
|
||||
'Cache-Control': 'no-cache',
|
||||
}
|
||||
header['User-Agent'] = random_useragent.getRandom();
|
||||
const options = {
|
||||
method: 'GET',
|
||||
url: url,
|
||||
headers: header,
|
||||
proxy: proxy.randomProxy(),
|
||||
// agentClass: agent,
|
||||
// agentOptions: {
|
||||
// socksHost: '101.71.41.169',
|
||||
// socksPort: 43,
|
||||
// }
|
||||
};
|
||||
if (encoding) {
|
||||
options.encoding = null;
|
||||
|
12
src/utils/proxys.js
Normal file
12
src/utils/proxys.js
Normal file
@ -0,0 +1,12 @@
|
||||
import stringUtil from './string.util';
|
||||
// Hard-coded pool of HTTP proxy endpoints.
const proxys = [
  'http://101.71.41.169:443',
  'http://116.196.81.58:3128',
  'http://113.200.56.13:8010'
];

export default {
  /**
   * Pick one proxy URL from the pool.
   *
   * @returns {String} the entry of `proxys` at the index returned by
   *   stringUtil.randomNum(0, last index)
   */
  randomProxy() {
    const lastIndex = proxys.length - 1;
    const pick = stringUtil.randomNum(0, lastIndex);
    return proxys[pick];
  }
};
|
@ -17,7 +17,9 @@ export default {
|
||||
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
|
||||
}
|
||||
},
|
||||
|
||||
randomNum(minNum, maxNum) {
|
||||
return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10);
|
||||
},
|
||||
getContentByReg(str, reg) {
|
||||
const contents = str.match(reg);
|
||||
if (contents && contents.length > 1) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user