// spider/src/sites/dandanzan.js
//
// 142 lines
// 4.4 KiB
// JavaScript

import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';
// Origin (scheme + host) that every site-relative link in this module is appended to.
const URL_BASE = 'https://www.dandanzan.com';
/**
 * Fragments removed (first occurrence of each, in this order) from a detail
 * link so that only the numeric id is left, e.g. `/dianying/123.html` -> `123`.
 */
const DETAIL_LINK_NOISE = ['/dianying/', '.html', '/dongman/', '/dianshiju/', '/zongyi/'];

/**
 * Splits a raw `links` payload into resource entries.
 * Entries are separated by `|` or `#`; inside an entry the text before the
 * first `$` is the title and the text after it is the link.
 * @param {String} resourceStr
 * @returns {Array<{title: String, link: String}>}
 */
const parseResources = (resourceStr) => resourceStr
  .split(/[|#]/)
  .map((entry) => {
    const separatorAt = entry.indexOf('$');
    return {
      title: entry.substring(0, separatorAt),
      link: entry.substring(separatorAt + 1),
    };
  });

/**
 * Turns a comma separated string into an array; a missing or empty value yields [].
 * @param {String|undefined} value
 * @returns {Array<String>}
 */
const splitCommaList = (value) => (value ? value.split(',') : []);

/**
 * Process one detail page: download it, read its `og:*` meta tags and resource
 * links, then create the matching Movie record or refresh the resources of an
 * existing one.
 *
 * Any error is caught and logged, so the returned promise never rejects.
 *
 * @param {String} subLink site-relative link of the page, e.g. `/dianying/123.html`
 * @param {String} category kind of content; movie: film, tv: TV series, show: variety show
 * @returns {Promise<void>}
 */
const parseOnePage = async (subLink, category) => {
  const url = `${URL_BASE}${subLink}`;
  try {
    const html = await netUtil.getData(url, {});
    const $ = cheerio.load(html);
    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }

    const idText = DETAIL_LINK_NOISE.reduce((text, fragment) => text.replace(fragment, ''), subLink);
    const gid = Number.parseInt(idText, 10);
    if (Number.isNaN(gid)) {
      // Without a numeric id the record can neither be looked up nor stored.
      console.log(`>>>>>>>>>>>> ${url} has no numeric id, skipped`);
      return;
    }

    // NOTE(review): getContentByReg presumably returns the first capture group,
    // or a falsy value when the pattern does not match -- confirm in string.util.
    const resourceStr = stringUtil.getContentByReg(html, /links='(.+?)\|'/);
    if (!resourceStr) {
      // Skip instead of crashing, and never overwrite stored resources with nothing.
      console.log(`>>>>>>>>>>>> ${url} has no resource links, skipped`);
      return;
    }
    const resources = parseResources(resourceStr);

    // Reads the `content` attribute of an Open Graph meta tag (undefined when absent).
    const meta = (name) => $(`meta[property="og:${name}"]`).attr('content');

    // The actor list is " / " separated on the page; the tag may be missing entirely.
    const actorText = meta('video:actor');
    const zhuYan = actorText ? actorText.replace(/ \/ /g, ',') : '';

    let record = await Movie.findByGid(gid);
    if (!record) {
      // A plain object literal has no save(); build a real model instance.
      // NOTE(review): assumes Movie is a model constructor (e.g. Mongoose) whose
      // instances expose save() -- confirm against ../models/Movies.
      record = new Movie({
        gid,
        name: meta('title'),
        type: splitCommaList(meta('video:class')),
        resources,
        daoYan: splitCommaList(meta('video:director')), // director(s)
        zhuYan: splitCommaList(zhuYan), // lead actor(s)
        score: Number(meta('video:score')),
        img: meta('image'),
        introduce: meta('description'),
        nameAlias: meta('video:alias'),
        region: splitCommaList(meta('video:area')),
        year: Number(meta('video:release_date')),
        category,
      });
    } else {
      // Known title: only the resource list is refreshed.
      record.resources = resources;
    }
    await record.save();
    console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
  } catch (err) {
    console.log(err);
  }
};
/**
 * Walks a list page and every following page reachable through its
 * `.next-page a` link. For each `.thumbnail` link found, a task that runs
 * parseOnePage(link, category) is handed to generalQueue.
 *
 * Download errors are logged and end the walk; the returned promise does not
 * reject because of them.
 *
 * @param {String} subPage site-relative link of the first list page
 * @param {String} category category passed through to parseOnePage
 * @returns {Promise<void>}
 */
const parseListPage = async (subPage, category) => {
  // Iterating (instead of recursing) lets each page's DOM be released before
  // the next one is loaded; `visited` stops a pager that links back to itself.
  const visited = new Set();
  let currentPage = subPage;

  while (currentPage && !visited.has(currentPage)) {
    visited.add(currentPage);
    console.log(`begin parse category: ${category} page: ${currentPage}`);

    let html;
    try {
      html = await netUtil.getData(`${URL_BASE}${currentPage}`, {});
    } catch (err) {
      console.log(err);
    }

    let nextPage;
    if (html) {
      const $ = cheerio.load(html);

      const pages = [];
      $('.thumbnail').each((i, link) => {
        const href = $(link).attr('href');
        // A thumbnail without href would otherwise be queued as "undefined".
        if (href) {
          pages.push(href);
        }
      });

      for (const page of pages) {
        try {
          generalQueue.addQueue({
            run: async () => {
              await parseOnePage(page, category);
            },
          });
        } catch (err) {
          console.log(err);
        }
      }

      // A cheerio selection is always truthy, so the presence of a next page
      // has to be decided from the link's href itself.
      nextPage = $('.next-page a').attr('href');
      if (nextPage) {
        console.log('has next page: ', nextPage);
      } else {
        console.log('########################### ALL LIST PAGE END ###########################');
      }
    }

    console.log(`end parse category: ${category} page: ${currentPage}`);
    currentPage = nextPage;
  }
};
/** Maps a category name to the site directory that holds its list pages. */
const CATEGORY_SUB_PATHS = new Map([
  ['movie', 'dianying'],
  ['tv', 'dianshiju'],
  ['show', 'zongyi'],
  ['cartoon', 'dongman'],
]);

/**
 * Starts a crawl of one category from its `index.html` list page.
 *
 * The time spent inside parseListPage is reported under the console timer
 * label `all`.
 *
 * @param {String} category one of movie, tv, show, cartoon; any other value
 *   falls back to the movie directory
 * @returns {Promise<void>}
 */
const parseAllMovie = async (category) => {
  console.time('all');
  try {
    const subName = CATEGORY_SUB_PATHS.get(category) || 'dianying';
    await parseListPage(`/${subName}/index.html`, category);
  } finally {
    // Always close the timer: otherwise nothing is reported and a later call
    // with the same label triggers a "label already exists" warning.
    console.timeEnd('all');
  }
};
export default {
  /**
   * Entry point of this site module: crawls the `movie` category.
   * The other categories parseAllMovie knows about (`tv`, `show`, `cartoon`)
   * are not crawled from here.
   * @returns {Promise<void>}
   */
  async run() {
    await parseAllMovie('movie');
  },
};