import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';

const URL_BASE = 'https://www.dandanzan.com';

// Fragments removed (first occurrence each, in this order) from a detail link
// so that only the numeric id is left.
const LINK_NOISE = ['/dianying/', '.html', '/dongman/', '/dianshiju/', '/zongyi/'];

/**
 * Extract the numeric id from a detail-page link such as `/dianying/123.html`.
 * @param {String} subLink
 * @returns {Number} the parsed id, or NaN when the link contains no leading number
 */
const extractGid = (subLink) => {
  const stripped = LINK_NOISE.reduce((rest, fragment) => rest.replace(fragment, ''), subLink);
  return Number.parseInt(stripped, 10);
};

/**
 * Turn the raw `links='...'` payload into a list of `{ title, link }` entries.
 * Entries are separated by `|` or `#`; inside an entry the first `$` separates
 * the title from the link.
 * @param {String} resourceStr
 * @returns {Array<{title: String, link: String}>}
 */
const parseResources = (resourceStr) =>
  resourceStr
    .replace(/\|/g, '#')
    .split('#')
    // Skip empty segments (e.g. doubled separators) instead of storing blank resources.
    .filter((entry) => entry !== '')
    .map((entry) => {
      const sep = entry.indexOf('$');
      // When there is no `$`, sep is -1: title becomes '' and link is the whole entry.
      return { title: entry.substring(0, sep), link: entry.substring(sep + 1) };
    });

/**
 * Split a comma-separated meta value into an array; missing values give [].
 * @param {String|undefined} value
 * @returns {Array<String>}
 */
const splitList = (value) => (value ? value.split(',') : []);

/**
 * Process one detail page: fetch it, read its resource links and Open Graph
 * meta tags, then create the matching Movie record or refresh the resources
 * of an existing one. Errors are logged, never rethrown.
 * @param {String} subLink path of the detail page, relative to URL_BASE
 * @param {String} category category; movie: film, tv: TV series, show: variety show
 */
const parseOnePage = async (subLink, category) => {
  const url = `${URL_BASE}${subLink}`;
  try {
    const html = await netUtil.getData(url, {});
    const $ = cheerio.load(html);
    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }

    const resourceStr = stringUtil.getContentByReg(html, /links='(.+?)\|'/);
    if (!resourceStr) {
      // Without the links payload there is nothing to store for this page.
      console.log(`>>>>>>>>>>>> ${url} has no resource links, skipped`);
      return;
    }
    const resourceArr = parseResources(resourceStr);

    const idx = extractGid(subLink);
    if (Number.isNaN(idx)) {
      console.log(`>>>>>>>>>>>> ${url} has no numeric id, skipped`);
      return;
    }

    // Read the `content` attribute of an Open Graph meta tag (undefined when absent).
    const meta = (property) => $(`meta[property="${property}"]`).attr('content');

    let record = await Movie.findByGid(idx);
    if (record) {
      // Known title: only the resource list is refreshed.
      record.resources = resourceArr;
    } else {
      const actors = meta('og:video:actor');
      // NOTE(review): assumes Movie is a constructible model whose instances
      // expose save() (the records returned by findByGid are used that way).
      // A plain object literal has no save(), so new titles were never stored.
      record = new Movie({
        gid: idx,
        name: meta('og:title'),
        type: splitList(meta('og:video:class')),
        resources: resourceArr,
        daoYan: splitList(meta('og:video:director')),
        // Actors are separated by " / " on the page; normalise to commas first.
        zhuYan: splitList(actors ? actors.replace(/ \/ /g, ',') : ''),
        score: Number(meta('og:video:score')),
        img: meta('og:image'),
        introduce: meta('og:description'),
        nameAlias: meta('og:video:alias'),
        region: splitList(meta('og:video:area')),
        year: Number(meta('og:video:release_date')),
        category,
      });
    }
    await record.save();
    console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
  } catch (err) {
    console.log(err);
  }
};

/**
 * Process one list page: hand every `.thumbnail` link to generalQueue as a
 * parseOnePage task, then follow the "next page" link (if any) recursively.
 * Errors are logged, never rethrown.
 * @param {String} subPage path of the list page, relative to URL_BASE
 * @param {String} category category passed through to parseOnePage
 */
const parseListPage = async (subPage, category) => {
  const url = `${URL_BASE}${subPage}`;
  console.log(`begin parse category: ${category} page: ${subPage}`);
  let html;
  try {
    html = await netUtil.getData(url, {});
  } catch (err) {
    console.log(err);
  }
  if (html) {
    const $ = cheerio.load(html);
    const pages = $('.thumbnail')
      .toArray()
      .map((el) => $(el).attr('href'))
      // Thumbnails without an href would otherwise enqueue a bogus URL.
      .filter(Boolean);
    for (const page of pages) {
      try {
        generalQueue.addQueue({
          async run() {
            await parseOnePage(page, category);
          },
        });
      } catch (err) {
        console.log(err);
      }
    }
    // A cheerio selection is always truthy, so test the href itself to detect
    // whether a next page really exists.
    const nextStr = $('.next-page a').attr('href');
    if (nextStr) {
      console.log('has next page: ', nextStr);
      try {
        await parseListPage(nextStr, category);
      } catch (err) {
        console.log(err);
      }
    } else {
      console.log('########################### ALL LIST PAGE END ###########################');
    }
  }
  console.log(`end parse category: ${category} page: ${subPage}`);
};

/**
 * Crawl every list page of one category, starting at its index page.
 * Unknown categories fall back to the movie section.
 * @param {String} category one of movie, tv, show, cartoon
 */
const parseAllMovie = async (category) => {
  console.time('all');
  let subName = 'dianying';
  switch (category) {
    case 'movie':
      subName = 'dianying';
      break;
    case 'tv':
      subName = 'dianshiju';
      break;
    case 'show':
      subName = 'zongyi';
      break;
    case 'cartoon':
      subName = 'dongman';
      break;
  }
  const subPage = `/${subName}/index.html`;
  try {
    await parseListPage(subPage, category);
  } finally {
    // Close the timer opened above. It covers the list-page crawl; whether the
    // detail tasks handed to generalQueue have finished by now depends on the
    // queue implementation.
    console.timeEnd('all');
  }
};

export default {
  /**
   * Entry point: crawl the movie category.
   */
  run: async () => {
    await parseAllMovie('movie');
    // await parseAllMovie('tv');
    // await parseAllMovie('show');
    // await parseAllMovie('cartoon');
    // console.log('all done');
    // let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
    // console.log(html);
  }
}