dandanzan的电影直接更新到movie表
This commit is contained in:
parent
8fcd8d1ae0
commit
20db2bc113
@ -21,6 +21,7 @@
|
|||||||
"cheerio": "^1.0.0-rc.3",
|
"cheerio": "^1.0.0-rc.3",
|
||||||
"dateformat": "^3.0.3",
|
"dateformat": "^3.0.3",
|
||||||
"file-stream-rotator": "^0.4.1",
|
"file-stream-rotator": "^0.4.1",
|
||||||
|
"free-proxy": "^0.1.5",
|
||||||
"fs": "0.0.1-security",
|
"fs": "0.0.1-security",
|
||||||
"fs-extra": "^7.0.1",
|
"fs-extra": "^7.0.1",
|
||||||
"glob": "^7.1.3",
|
"glob": "^7.1.3",
|
||||||
@ -28,8 +29,11 @@
|
|||||||
"mkdirp": "^0.5.1",
|
"mkdirp": "^0.5.1",
|
||||||
"mongoose": "^5.2.15",
|
"mongoose": "^5.2.15",
|
||||||
"multi-progress": "^2.0.0",
|
"multi-progress": "^2.0.0",
|
||||||
|
"proxy-lists": "^1.16.0",
|
||||||
|
"random-useragent": "^0.3.1",
|
||||||
"request": "^2.88.0",
|
"request": "^2.88.0",
|
||||||
"request-promise": "^4.2.4",
|
"request-promise": "^4.2.4",
|
||||||
|
"socks5-http-client": "^1.0.4",
|
||||||
"ws": "^6.1.2"
|
"ws": "^6.1.2"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
@ -3,7 +3,7 @@ import mongoose from 'mongoose';
|
|||||||
const Schema = mongoose.Schema;
|
const Schema = mongoose.Schema;
|
||||||
|
|
||||||
const Movies = new Schema({
|
const Movies = new Schema({
|
||||||
gid: {type: Number, required: true},
|
gid: {type: Number, required: true, index: true},
|
||||||
// 电影名
|
// 电影名
|
||||||
name: {type: String},
|
name: {type: String},
|
||||||
nameAlias: {type: String},
|
nameAlias: {type: String},
|
||||||
@ -35,9 +35,15 @@ const Movies = new Schema({
|
|||||||
// 显示的播放数量
|
// 显示的播放数量
|
||||||
showCount: {type: Number, default: 1000},
|
showCount: {type: Number, default: 1000},
|
||||||
// 排序字段, 倒序
|
// 排序字段, 倒序
|
||||||
sortIdx: {type: Number},
|
sortIdx: {type: Number, default: 0},
|
||||||
|
// jc排序字段, 倒序,查询电影时候优先此字段倒序
|
||||||
|
sortJc: {type: Number, default: 0},
|
||||||
|
// 是否公开
|
||||||
|
open: {type: Boolean, default: false},
|
||||||
// 购买价格
|
// 购买价格
|
||||||
price: {type: Number},
|
price: {type: Number, default: 10},
|
||||||
|
// 视频分类,movie: 电影;tv: 电视剧; show: 综艺节目
|
||||||
|
category: {type: String},
|
||||||
// 是否已删除
|
// 是否已删除
|
||||||
deleted: {type: Boolean, default: false},
|
deleted: {type: Boolean, default: false},
|
||||||
delete_time: {type: Date},
|
delete_time: {type: Date},
|
||||||
@ -53,4 +59,9 @@ MovieModel.findByGid = function(gid) {
|
|||||||
return MovieModel.findOne({gid: gid, deleted: false});
|
return MovieModel.findOne({gid: gid, deleted: false});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
MovieModel.updateOne = async function (gid, record) {
|
||||||
|
const query = {gid: gid};
|
||||||
|
const options = {upsert: true, setDefaultsOnInsert:true};
|
||||||
|
await MovieModel.update(query, record, options);
|
||||||
|
}
|
||||||
export default MovieModel;
|
export default MovieModel;
|
||||||
|
@ -1,57 +1,67 @@
|
|||||||
import netUtil from "../utils/net.util";
|
import netUtil from "../utils/net.util";
|
||||||
import cheerio from "cheerio";
|
import cheerio from "cheerio";
|
||||||
import stringUtil from '../utils/string.util';
|
import stringUtil from '../utils/string.util';
|
||||||
import SpiderData from "../models/SpiderData";
|
|
||||||
import Movie from '../models/Movies';
|
import Movie from '../models/Movies';
|
||||||
import generalQueue from '../utils/general.queue';
|
import generalQueue from '../utils/general.queue';
|
||||||
|
|
||||||
const parseOnePage = async (subLink) => {
|
const URL_BASE = 'https://www.dandanzan.com'
|
||||||
// const url = `https://www.dandanzan.com/dianying/${idx}.html`
|
/**
|
||||||
const url = `https://www.dandanzan.com${subLink}`
|
* 处理一个页面
|
||||||
|
* @param {String} subLink
|
||||||
|
* @param {String} category, 种类, movie: 电影, tv: 电视剧, show: 综艺
|
||||||
|
* */
|
||||||
|
const parseOnePage = async (subLink, category) => {
|
||||||
|
const url = `${URL_BASE}${subLink}`
|
||||||
try {
|
try {
|
||||||
let html = await netUtil.getData(url, {})
|
let html = await netUtil.getData(url, {})
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
if ($('.error404').text()) {
|
if ($('.error404').text()) {
|
||||||
console.log(`>>>>>>>>>>>> ${url} not found`);
|
console.log(`>>>>>>>>>>>> ${url} not found`);
|
||||||
} else {
|
} else {
|
||||||
let link = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
|
let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
|
||||||
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', ''));
|
let idx = parseInt(subLink.replace('/dianying/', '').replace('.html', '')
|
||||||
let obj = {
|
.replace('/dongman/', '')
|
||||||
id: idx,
|
.replace('/dianshiju/', '')
|
||||||
daoYan: $('meta[property="og:video:director"]').attr('content'),
|
.replace('/zongyi/', ''));
|
||||||
zhuYan: $('meta[property="og:video:actor"]').attr('content'),
|
const type = $('meta[property="og:video:class"]').attr('content');
|
||||||
grade: $('meta[property="og:video:score"]').attr('content'),
|
let typeArr = type ? type.split(',') : [];
|
||||||
|
let arr = resourceStr.replace(/\|/g, '#').split('#');
|
||||||
|
let resourceArr = [];
|
||||||
|
for (let str of arr) {
|
||||||
|
resourceArr.push({
|
||||||
|
title: str.substring(0, str.indexOf('$')),
|
||||||
|
link: str.substring(str.indexOf('$') + 1)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
let daoYan = $('meta[property="og:video:director"]').attr('content');
|
||||||
|
let zhuYan = $('meta[property="og:video:actor"]').attr('content').replace(/ \/ /g, ',');
|
||||||
|
let region = $('meta[property="og:video:area"]').attr('content');
|
||||||
|
let record = {
|
||||||
|
gid: idx,
|
||||||
|
name: $('meta[property="og:title"]').attr('content'),
|
||||||
|
type: typeArr,
|
||||||
|
resources: resourceArr,
|
||||||
|
daoYan: daoYan ? daoYan.split(',') : [],
|
||||||
|
zhuYan: zhuYan ? zhuYan.split(',') : [],
|
||||||
|
score: Number($('meta[property="og:video:score"]').attr('content')),
|
||||||
img: $('meta[property="og:image"]').attr('content'),
|
img: $('meta[property="og:image"]').attr('content'),
|
||||||
introduce: $('meta[property="og:description"]').attr('content'),
|
introduce: $('meta[property="og:description"]').attr('content'),
|
||||||
language: '',
|
nameAlias: $('meta[property="og:video:alias"]').attr('content'),
|
||||||
name: $('meta[property="og:title"]').attr('content'),
|
region: region ? region.split(',') : [],
|
||||||
name_alias: $('meta[property="og:video:alias"]').attr('content'),
|
year: Number($('meta[property="og:video:release_date"]').attr('content')),
|
||||||
playResourceUrl: link,
|
category: category,
|
||||||
region: $('meta[property="og:video:area"]').attr('content'),
|
|
||||||
releaseDate: $('meta[property="og:video:release_date"]').attr('content'),
|
|
||||||
type: $('meta[property="og:video:class"]').attr('content')
|
|
||||||
}
|
}
|
||||||
let sdata = new SpiderData({
|
await Movie.updateOne(idx, record);
|
||||||
type: 'movie',
|
console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
|
||||||
data: obj,
|
|
||||||
status: 0
|
|
||||||
});
|
|
||||||
await sdata.save();
|
|
||||||
console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
|
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(err);
|
console.log(err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parseListPage = async (idx) => {
|
const parseListPage = async (subPage, category) => {
|
||||||
let url;
|
const url = `${URL_BASE}${subPage}`
|
||||||
if (idx === 1) {
|
console.log(`begin parse category: ${category} page: ${subPage}`);
|
||||||
url = `https://www.dandanzan.com/dianying/index.html`;
|
|
||||||
} else {
|
|
||||||
url = `https://www.dandanzan.com/dianying/index_${idx}.html`;
|
|
||||||
}
|
|
||||||
console.log(`begin parse page: ${idx}`);
|
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
html = await netUtil.getData(url, {})
|
html = await netUtil.getData(url, {})
|
||||||
@ -69,71 +79,79 @@ const parseListPage = async (idx) => {
|
|||||||
try {
|
try {
|
||||||
generalQueue.addQueue({
|
generalQueue.addQueue({
|
||||||
run: async function () {
|
run: async function () {
|
||||||
await parseOnePage(page);
|
await parseOnePage(page, category);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(err);
|
console.log(err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
if ($('.next-page')) {
|
||||||
console.log(`end parse page: ${idx}`);
|
let nextStr = $('.next-page a').attr('href');
|
||||||
}
|
console.log('has next page: ', nextStr);
|
||||||
const parseAllMovie = async () => {
|
|
||||||
const maxPage = 939;
|
|
||||||
// const maxPage = 10;
|
|
||||||
for (let i = 1; i < maxPage; i++) {
|
|
||||||
try {
|
try {
|
||||||
await parseListPage(i);
|
await parseListPage(nextStr, category);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(err);
|
console.log(err);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
console.log('########################### ALL LIST PAGE END ###########################');
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`end parse category: ${category} page: ${subPage}`);
|
||||||
}
|
}
|
||||||
// 将下载的电影导入到正式表中
|
const parseAllMovie = async (category) => {
|
||||||
const parseAllMovieToDb = async () => {
|
console.time('all');
|
||||||
try {
|
let subName = 'dianying';
|
||||||
let list = await SpiderData.find({type: 'movie'});
|
switch (category) {
|
||||||
for (let obj of list) {
|
case 'movie':
|
||||||
let data = obj.data;
|
subName = 'dianying';
|
||||||
let typeArr = data.type.split(',');
|
break;
|
||||||
let resourceStr = data.playResourceUrl;
|
case 'tv':
|
||||||
let arr = resourceStr.replace(/\|/g, '#').split('#');
|
subName = 'dianshiju'
|
||||||
let resourceArr = [];
|
break;
|
||||||
for (let str of arr) {
|
case 'show':
|
||||||
resourceArr.push({
|
subName = 'zongyi';
|
||||||
title: str.substring(0, str.indexOf('$')),
|
break;
|
||||||
link: str.substring(str.indexOf('$') + 1)
|
case 'cartoon':
|
||||||
})
|
subName = 'dongman';
|
||||||
}
|
break;
|
||||||
let record = new Movie({
|
|
||||||
gid: data.id,
|
|
||||||
name: data.name,
|
|
||||||
nameAlias: data.name_alias,
|
|
||||||
type: typeArr,
|
|
||||||
score: data.grade,
|
|
||||||
introduce: data.introduce,
|
|
||||||
language: data.language,
|
|
||||||
img: data.img,
|
|
||||||
daoYan: data.daoYan.split(','),
|
|
||||||
zhuYan: data.zhuYan.split(','),
|
|
||||||
region: data.region.split(','),
|
|
||||||
year: data.releaseDate,
|
|
||||||
sortIdx: 0,
|
|
||||||
price: 10,
|
|
||||||
resources: resourceArr,
|
|
||||||
published: true,
|
|
||||||
});
|
|
||||||
await record.save();
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
console.log(err);
|
|
||||||
}
|
}
|
||||||
|
const subPage = `/${subName}/index.html`
|
||||||
|
await parseListPage(subPage, category);
|
||||||
}
|
}
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
run: async () => {
|
run: async () => {
|
||||||
// await parseAllMovie();
|
// await parseAllMovie('movie');
|
||||||
await parseAllMovieToDb();
|
// await parseAllMovie('tv');
|
||||||
console.log('all done');
|
// await parseAllMovie('show');
|
||||||
|
// await parseAllMovie('cartoon');
|
||||||
|
// console.log('all done');
|
||||||
|
let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
|
||||||
|
console.log(html);
|
||||||
|
// var ProxyLists = require('proxy-lists');
|
||||||
|
//
|
||||||
|
// var options = {
|
||||||
|
// countries: ['cn'],
|
||||||
|
// protocols: ['https'],
|
||||||
|
// };
|
||||||
|
//
|
||||||
|
// var gettingProxies = ProxyLists.getProxies(options);
|
||||||
|
// gettingProxies.on('data', function(proxies) {
|
||||||
|
// // Received some proxies.
|
||||||
|
// console.log(proxies);
|
||||||
|
// });
|
||||||
|
//
|
||||||
|
// gettingProxies.on('error', function(error) {
|
||||||
|
// // Some error has occurred.
|
||||||
|
// // console.error(error);
|
||||||
|
// });
|
||||||
|
//
|
||||||
|
// gettingProxies.once('end', function() {
|
||||||
|
// // Done getting proxies.
|
||||||
|
// console.log('finish get proxy');
|
||||||
|
// });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,7 @@ let q = async.queue( async (reqObj, cb) => {
|
|||||||
}, 10);
|
}, 10);
|
||||||
q.drain = function(){
|
q.drain = function(){
|
||||||
console.info('all queue done');
|
console.info('all queue done');
|
||||||
|
console.timeEnd('all');
|
||||||
};
|
};
|
||||||
module.exports = {
|
module.exports = {
|
||||||
addQueue(obj) {
|
addQueue(obj) {
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
import request from 'request';
|
import request from 'request';
|
||||||
import Promise from 'bluebird';
|
import Promise from 'bluebird';
|
||||||
|
import proxy from './proxys';
|
||||||
|
import random_useragent from 'random-useragent';
|
||||||
|
let agent = require('socks5-http-client/lib/Agent')
|
||||||
|
|
||||||
|
|
||||||
const iconv = require('iconv-lite');
|
const iconv = require('iconv-lite');
|
||||||
@ -26,6 +29,7 @@ export default {
|
|||||||
'Cache-Control': 'no-cache',
|
'Cache-Control': 'no-cache',
|
||||||
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
|
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
|
||||||
}
|
}
|
||||||
|
header['User-Agent'] = random_useragent.getRandom();
|
||||||
const options = {
|
const options = {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
url: url,
|
url: url,
|
||||||
@ -39,10 +43,17 @@ export default {
|
|||||||
header = header || {
|
header = header || {
|
||||||
'Cache-Control': 'no-cache',
|
'Cache-Control': 'no-cache',
|
||||||
}
|
}
|
||||||
|
header['User-Agent'] = random_useragent.getRandom();
|
||||||
const options = {
|
const options = {
|
||||||
method: 'GET',
|
method: 'GET',
|
||||||
url: url,
|
url: url,
|
||||||
headers: header,
|
headers: header,
|
||||||
|
proxy: proxy.randomProxy(),
|
||||||
|
// agentClass: agent,
|
||||||
|
// agentOptions: {
|
||||||
|
// socksHost: '101.71.41.169',
|
||||||
|
// socksPort: 43,
|
||||||
|
// }
|
||||||
};
|
};
|
||||||
if (encoding) {
|
if (encoding) {
|
||||||
options.encoding = null;
|
options.encoding = null;
|
||||||
|
12
src/utils/proxys.js
Normal file
12
src/utils/proxys.js
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import stringUtil from './string.util';
|
||||||
|
const proxys = [
|
||||||
|
'http://101.71.41.169:443',
|
||||||
|
'http://116.196.81.58:3128',
|
||||||
|
'http://113.200.56.13:8010'
|
||||||
|
];
|
||||||
|
export default {
|
||||||
|
|
||||||
|
randomProxy() {
|
||||||
|
return proxys[stringUtil.randomNum(0, proxys.length - 1)];
|
||||||
|
}
|
||||||
|
}
|
@ -17,7 +17,9 @@ export default {
|
|||||||
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
|
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
randomNum(minNum, maxNum) {
|
||||||
|
return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10);
|
||||||
|
},
|
||||||
getContentByReg(str, reg) {
|
getContentByReg(str, reg) {
|
||||||
const contents = str.match(reg);
|
const contents = str.match(reg);
|
||||||
if (contents && contents.length > 1) {
|
if (contents && contents.length > 1) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user