spider/src/sites/dandanzan.js

import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';
import proxyUtil from '../utils/proxy.util';
import CrawlRecord from '../models/spider/CrawlRecord';

// Origin of the crawled site; list/detail paths are appended to it to build request URLs.
const URL_BASE = 'https://www.dandanzan.com'
// Starting value for descending sort indices: parseListPage assigns
// sortIdx = maxIdx - (idx * 24 + position of the entry on the list page).
const maxIdx = 100000;
/**
 * Split the raw `links='…'` payload of a detail page into resource entries.
 *
 * Entries are separated by `|` or `#`; inside an entry the text before the
 * first `$` is the title and the text after it is the link. Empty segments
 * (produced by consecutive separators) are dropped instead of being turned
 * into entries with an empty title and link.
 *
 * @param {String} resourceStr raw payload captured from the page
 * @returns {Array<{title: String, link: String}>}
 */
const splitResourceList = (resourceStr) => {
  const resources = [];
  for (const segment of resourceStr.split(/[|#]/)) {
    if (!segment) {
      continue;
    }
    const separatorAt = segment.indexOf('$');
    resources.push({
      // When there is no '$', substring(0, -1) yields '' and the whole segment becomes the link.
      title: segment.substring(0, separatorAt),
      link: segment.substring(separatorAt + 1),
    });
  }
  return resources;
};

/**
 * Process one detail page: fetch it through the proxy helper, record the crawl
 * outcome in CrawlRecord, then create the Movie for this page or refresh the
 * resources / sortIdx of the already stored one.
 *
 * @param {String} subLink path of the detail page, appended to URL_BASE
 * @param {String} category kind of content — movie: film, tv: TV series, show: variety show
 *   (the list crawler in this file also passes 'cartoon')
 * @param {Number} sortIdx sort index stored on the record
 * @returns {Promise<false|undefined>} `false` when no HTML could be fetched,
 *   otherwise `undefined`. Parsing and saving errors are logged, not thrown.
 * */
const parseOnePage = async ({subLink, category, sortIdx}) => {
  const url = `${URL_BASE}${subLink}`;
  let html;
  let fetched = false;
  try {
    html = await proxyUtil.getDataProxy(url);
    fetched = true;
  } catch (err) {
    console.log('parse page with network error:', url);
  }
  // Bookkeeping is kept apart from the fetch so that a CrawlRecord failure is
  // neither reported as a network error nor able to flip lastStatus.
  try {
    await CrawlRecord.updateRecord({
      url,
      className: 'dandanzan',
      methodName: 'parseOnePage',
      params: {subLink, category, sortIdx},
      lastStatus: fetched,
    });
  } catch (err) {
    console.log('update crawl record failed:', url, err);
  }
  if (!html) {
    return false;
  }
  try {
    const $ = cheerio.load(html);
    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }
    // Reads the `content` attribute of an Open Graph style <meta> tag; undefined when absent.
    const meta = (property) => $(`meta[property="${property}"]`).attr('content');
    const toList = (value) => (value ? value.split(',') : []);

    const gid = Number.parseInt(stringUtil.findByReg(subLink, /\/.+?\/(\d+?)\.html/), 10);
    if (Number.isNaN(gid)) {
      console.log(`cannot extract id from ${subLink}, skipped`);
      return;
    }
    const resourceStr = stringUtil.getContentByReg(html, /links='(.+?)\|'/);
    if (typeof resourceStr !== 'string') {
      // Nothing is saved in this case, so stored resources are never wiped by a page without links.
      console.log(`no resource links found in ${url}, skipped`);
      return;
    }
    const resources = splitResourceList(resourceStr);

    let record = await Movie.findByGid(gid);
    if (!record) {
      // The actor tag may be missing; only normalise " / " separators when it is present.
      const actors = meta('og:video:actor');
      record = new Movie({
        gid,
        name: meta('og:title'),
        type: toList(meta('og:video:class')),
        resources,
        daoYan: toList(meta('og:video:director')),
        zhuYan: toList(actors ? actors.replace(/ \/ /g, ',') : actors),
        score: Number(meta('og:video:score')),
        img: meta('og:image'),
        introduce: meta('og:description'),
        nameAlias: meta('og:video:alias'),
        region: toList(meta('og:video:area')),
        year: Number(meta('og:video:release_date')),
        category,
        sortIdx,
        is_new: true,
      });
    } else {
      // Known title: only the playable resources and the ordering are refreshed.
      record.resources = resources;
      record.sortIdx = sortIdx;
    }
    await record.save();
    console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
  } catch (err) {
    console.log(err);
  }
};

// URL path segment used by the site for each supported category.
const LIST_PATH_BY_CATEGORY = new Map([
  ['movie', 'dianying'],
  ['tv', 'dianshiju'],
  ['show', 'zongyi'],
  ['cartoon', 'dongman'],
]);
// Gap in sort indices between consecutive list pages
// (presumably the number of entries per list page — confirm against the site).
const SORT_IDX_PAGE_STRIDE = 24;

/**
 * Fetch one list page of a category, record the crawl outcome in CrawlRecord
 * and queue a parseOnePage task for every `.thumbnail` link found on it.
 *
 * @param {Number} idx page index; a falsy value selects the unnumbered first page
 * @param {String} category 'movie' | 'tv' | 'show' | 'cartoon'; anything else
 *   falls back to the movie listing
 * @returns {Promise<Number>} for the first page, the page number parsed from the
 *   last pagination link; `1` for every other page, when the fetch failed, or
 *   when that number cannot be determined
 */
const parseListPage = async ({idx, category}) => {
  const subName = LIST_PATH_BY_CATEGORY.get(category) || 'dianying';
  const subPage = !idx ? `/${subName}/----onclick.html` : `/${subName}/---${idx}-onclick.html`;
  const url = `${URL_BASE}${subPage}`;
  console.log(`begin parse category: ${category} page: ${subPage}`);

  let html;
  let fetched = false;
  try {
    html = await proxyUtil.getDataProxy(url);
    fetched = true;
  } catch (err) {
    console.log('parse page with network error:', url);
  }
  // Bookkeeping is kept apart from the fetch so that a CrawlRecord failure is
  // neither reported as a network error nor able to flip lastStatus.
  try {
    await CrawlRecord.updateRecord({
      url,
      className: 'dandanzan',
      methodName: 'parseListPage',
      params: {idx, category},
      lastStatus: fetched,
    });
  } catch (err) {
    console.log('update crawl record failed:', url, err);
  }
  if (!html) {
    return 1;
  }

  const $ = cheerio.load(html);
  const pages = [];
  $('.thumbnail').each((i, el) => {
    const href = $(el).attr('href');
    // A thumbnail without href would otherwise be queued as `${URL_BASE}undefined`.
    if (href) {
      pages.push(href);
    }
  });

  // A falsy idx (first page) contributes no offset instead of NaN.
  const pageOffset = idx ? idx * SORT_IDX_PAGE_STRIDE : 0;
  pages.forEach((subLink, position) => {
    // Earlier pages / earlier entries receive larger sort indices.
    const sortIdx = maxIdx - (pageOffset + position);
    try {
      generalQueue.addQueue({
        async run() {
          await parseOnePage({subLink, category, sortIdx});
        },
      });
    } catch (err) {
      console.log(err);
    }
  });

  if (idx) {
    return 1;
  }
  // Only the first page is asked for the total: it is read from the last pagination link.
  const lastPage = $('.pagination ul li').last().find('a').attr('href');
  if (!lastPage) {
    return 1;
  }
  const pageCount = Number.parseInt(
    stringUtil.findByReg(lastPage, /\/.+?\/---(\d+?)-onclick\.html/),
    10
  );
  return Number.isNaN(pageCount) ? 1 : pageCount;
};
/**
 * Crawl every list page of one category.
 *
 * The unnumbered first page (idx 0) is parsed first; when it reports more than
 * one page, pages 1..N are then parsed one after another.
 * NOTE(review): whether page index 1 repeats the unnumbered first page depends
 * on the site's URL scheme — confirm.
 *
 * @param {String} category category forwarded to parseListPage
 * @returns {Promise<void>}
 */
const parseAllMovie = async (category) => {
  console.time('all');
  try {
    const allPageNo = await parseListPage({idx: 0, category});
    console.log('app page is', allPageNo);
    if (allPageNo > 1) {
      for (let i = 1; i <= allPageNo; i++) {
        await parseListPage({idx: i, category});
      }
    }
  } finally {
    // Always close the timer: this reports the elapsed time and frees the
    // 'all' label so the next category can start it again without a warning.
    console.timeEnd('all');
  }
};

export default {
  /**
   * Entry point of the spider: crawls the categories one after another,
   * in the order movie, tv, show, cartoon.
   */
  run: async () => {
    // await proxy.run();
    for (const category of ['movie', 'tv', 'show', 'cartoon']) {
      await parseAllMovie(category);
    }
  }
}