177 lines
5.4 KiB
JavaScript
177 lines
5.4 KiB
JavaScript
import cheerio from "cheerio";
|
|
import stringUtil from '../utils/string.util';
|
|
import Movie from '../models/Movies';
|
|
import generalQueue from '../utils/general.queue';
|
|
import proxyUtil from '../utils/proxy.util';
|
|
import CrawlRecord from '../models/spider/CrawlRecord';
|
|
|
|
// Origin of the crawled site; site-relative page paths are appended to it.
const URL_BASE = 'https://www.dandanzan.com';

// Upper bound that sort indexes are counted down from (see the sortIdx computation in parseListPage).
const maxIdx = 100000;
|
|
/**
 * Crawl one detail page and create or refresh the matching Movie document.
 *
 * The page is downloaded through `proxyUtil.getDataProxy`, the outcome of the
 * download is stored with `CrawlRecord.updateRecord`, and the metadata is read
 * from the page's Open Graph `<meta>` tags. A record that does not exist yet is
 * created with every parsed field; an existing record only gets `resources`
 * and `sortIdx` refreshed.
 *
 * @param {Object} options
 * @param {String} options.subLink - site-relative path of the detail page; the
 *   number in it is used as the Movie `gid`
 * @param {String} options.category - kind of content stored on the record,
 *   e.g. movie (film), tv (TV series), show (variety show)
 * @param {Number} options.sortIdx - sort position stored on the record
 * @returns {Promise<Boolean|undefined>} `false` when no HTML could be
 *   downloaded, otherwise `undefined`; parse and save problems are logged
 *   instead of being thrown
 */
const parseOnePage = async ({subLink, category, sortIdx}) => {
  const url = `${URL_BASE}${subLink}`;

  // Same bookkeeping payload for the success and the failure path.
  const recordCrawl = (lastStatus) => CrawlRecord.updateRecord({
    url: url,
    className: 'dandanzan',
    methodName: 'parseOnePage',
    params: {subLink, category, sortIdx},
    lastStatus: lastStatus,
  });

  let html;
  let fetched = false;
  try {
    html = await proxyUtil.getDataProxy(url);
    fetched = true;
  } catch (err) {
    console.log('parse page with network error:', url);
  }

  // Kept outside the download try/catch so that a bookkeeping failure is not
  // reported as a network error and cannot abort the crawl of this page.
  try {
    await recordCrawl(fetched);
  } catch (err) {
    console.log(err);
  }

  if (!html) {
    return false;
  }

  try {
    const $ = cheerio.load(html);
    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }

    const metaContent = (property) => $(`meta[property="${property}"]`).attr('content');
    const splitList = (value) => (value ? value.split(',') : []);

    const idx = parseInt(stringUtil.findByReg(subLink, /\/.+?\/(\d+?)\.html/), 10);
    if (Number.isNaN(idx)) {
      // Without a numeric id there is nothing to look the record up by.
      console.log(`>>>>>>>>>>>> ${url} has no numeric id, skipped`);
      return;
    }

    // Entries are separated by '|' or '#'; each entry is "<title>$<link>".
    const resourceStr = stringUtil.getContentByReg(html, /links='(.+?)\|'/) || '';
    const resourceArr = resourceStr
      .replace(/\|/g, '#')
      .split('#')
      .filter((entry) => entry !== '')
      .map((entry) => {
        const separatorAt = entry.indexOf('$');
        return {
          title: entry.substring(0, separatorAt),
          link: entry.substring(separatorAt + 1),
        };
      });

    // The actor tag may be missing; calling .replace on undefined used to
    // throw and drop the whole record.
    const actorContent = metaContent('og:video:actor');
    const zhuYan = actorContent ? actorContent.replace(/ \/ /g, ',') : '';

    let record = await Movie.findByGid(idx);
    if (!record) {
      record = new Movie({
        gid: idx,
        name: metaContent('og:title'),
        type: splitList(metaContent('og:video:class')),
        resources: resourceArr,
        daoYan: splitList(metaContent('og:video:director')),
        zhuYan: splitList(zhuYan),
        score: Number(metaContent('og:video:score')),
        img: metaContent('og:image'),
        introduce: metaContent('og:description'),
        nameAlias: metaContent('og:video:alias'),
        region: splitList(metaContent('og:video:area')),
        year: Number(metaContent('og:video:release_date')),
        category: category,
        sortIdx: sortIdx,
        is_new: true,
      });
    } else {
      // Do not wipe previously stored resources when this crawl found none.
      if (resourceArr.length > 0) {
        record.resources = resourceArr;
      }
      record.sortIdx = sortIdx;
    }

    await record.save();
    console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
  } catch (err) {
    console.log(err);
  }
};
|
|
|
|
/**
 * Crawl one listing page of a category and queue every detail page linked
 * from it.
 *
 * The listing is downloaded through `proxyUtil.getDataProxy` and the outcome
 * is stored with `CrawlRecord.updateRecord`. Every `.thumbnail` link found is
 * handed to `generalQueue.addQueue` as a task that calls `parseOnePage`; the
 * tasks are only queued here, not awaited.
 *
 * @param {Object} options
 * @param {Number} options.idx - page number; a falsy value selects the first
 *   listing page
 * @param {String} options.category - movie, tv, show or cartoon; any other
 *   value falls back to the movie listing
 * @returns {Promise<Number>} for the first page, the page number read from the
 *   last pagination link; `1` for every other page, when the download failed,
 *   or when no page number could be read
 */
const parseListPage = async ({idx, category}) => {
  // Number of entries the sort index advances per listing page.
  const PAGE_SIZE = 24;
  const SUB_NAMES = new Map([
    ['movie', 'dianying'],
    ['tv', 'dianshiju'],
    ['show', 'zongyi'],
    ['cartoon', 'dongman'],
  ]);

  const subName = SUB_NAMES.get(category) || 'dianying';
  const subPage = !idx ? `/${subName}/----onclick.html` : `/${subName}/---${idx}-onclick.html`;
  const url = `${URL_BASE}${subPage}`;
  console.log(`begin parse category: ${category} page: ${subPage}`);

  let html;
  let fetched = false;
  try {
    html = await proxyUtil.getDataProxy(url);
    fetched = true;
  } catch (err) {
    console.log('parse page with network error:', url);
  }

  // Kept outside the download try/catch so that a bookkeeping failure is not
  // reported as a network error and cannot abort the listing crawl.
  try {
    await CrawlRecord.updateRecord({
      url: url,
      className: 'dandanzan',
      methodName: 'parseListPage',
      params: {idx, category},
      lastStatus: fetched,
    });
  } catch (err) {
    console.log(err);
  }

  if (!html) {
    return 1;
  }

  const $ = cheerio.load(html);
  const pages = [];
  $('.thumbnail').each((i, element) => {
    pages.push($(element).attr('href'));
  });

  const firstSortIdx = maxIdx - (idx || 0) * PAGE_SIZE;
  pages.forEach((subLink, position) => {
    if (!subLink) {
      // A thumbnail without href would be crawled as ".../undefined"; skip it
      // but keep the positions of the remaining entries unchanged.
      return;
    }
    const sortIdx = firstSortIdx - position;
    try {
      generalQueue.addQueue({
        run: async () => {
          await parseOnePage({subLink, category, sortIdx});
        },
      });
    } catch (err) {
      console.log(err);
    }
  });

  if (idx) {
    return 1;
  }

  // Only the first page is asked for the total page count.
  const lastPage = $('.pagination ul li').last().find('a').attr('href');
  if (!lastPage) {
    return 1;
  }
  const lastPageNo = parseInt(stringUtil.findByReg(lastPage, /\/.+?\/---(\d+?)-onclick\.html/), 10);
  return Number.isNaN(lastPageNo) ? 1 : lastPageNo;
};
|
|
/**
 * Walk every listing page of one category.
 *
 * The first listing page is parsed to learn the page count; when that count is
 * greater than 1, pages 1..count are parsed one after another. Only the
 * listing pages are awaited here, so the 'all' timer covers the listing walk.
 *
 * @param {String} category - category name forwarded to parseListPage
 * @returns {Promise<void>}
 */
const parseAllMovie = async (category) => {
  console.time('all');
  try {
    const allPageNo = await parseListPage({idx: 0, category: category});
    console.log('app page is', allPageNo);
    if (allPageNo > 1) {
      for (let i = 1; i <= allPageNo; i++) {
        await parseListPage({idx: i, category: category});
      }
    }
  } finally {
    // Close the timer even on failure so the next call can reuse the label.
    console.timeEnd('all');
  }
};
|
|
|
|
export default {
  /**
   * Crawl every supported category, one after another, in the order
   * movie, tv, show, cartoon.
   *
   * NOTE(review): the original body carried a commented-out `proxy.run()`
   * call before the first category; it is not executed.
   *
   * @returns {Promise<void>}
   */
  async run() {
    const categories = ['movie', 'tv', 'show', 'cartoon'];
    for (const category of categories) {
      await parseAllMovie(category);
    }
  },
};
|