diff --git a/src/app.js b/src/app.js
index dea456d..3dee0f4 100644
--- a/src/app.js
+++ b/src/app.js
@@ -7,6 +7,7 @@ import hoh8 from './sites/hoh8';
 import movie from './sites/movie';
 import book from './sites/book';
 import bookChapter from './sites/bookChapter';
+import dandanzan from './sites/dandanzan';
 
 mongoose.Promise = Promise;
 
@@ -21,8 +22,9 @@ db.once('open', function () {
   logger.info('Connected to db.');
   // hoh8.run();
   // book.run();
-  movie.run();
+  // movie.run();
   // bookChapter.run();
+  dandanzan.run();
 });
 
 mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});
diff --git a/src/models/Movies.js b/src/models/Movies.js
index 645ab3d..0482b81 100644
--- a/src/models/Movies.js
+++ b/src/models/Movies.js
@@ -6,6 +6,7 @@ const Movies = new Schema({
   gid: {type: Number, required: true},
   // 电影名
   name: {type: String},
+  name_alias: {type: String},
   /**
    科幻片: 1
    喜剧片: 2
diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js
new file mode 100644
index 0000000..8d83a3a
--- /dev/null
+++ b/src/sites/dandanzan.js
@@ -0,0 +1,128 @@
+import netUtil from "../utils/net.util";
+import cheerio from "cheerio";
+import stringUtil from '../utils/string.util';
+import SpiderDataModel from "../models/SpiderData";
+import generalQueue from '../utils/general.queue';
+
+const BASE_URL = 'https://www.dandanzan.com';
+
+/**
+ * Fetch one movie detail page, extract its metadata and store it as a
+ * SpiderData record of type 'movie' with status 0.
+ *
+ * Failures are logged and swallowed so that one bad page does not stop the
+ * rest of the crawl.
+ *
+ * @param {string} subLink - site-relative path such as `/dianying/123.html`.
+ * @returns {Promise<void>}
+ */
+const parseOnePage = async (subLink) => {
+  const url = `${BASE_URL}${subLink}`;
+  try {
+    const html = await netUtil.getData(url, {});
+    const $ = cheerio.load(html);
+    if ($('.error404').text()) {
+      console.log(`>>>>>>>>>>>> ${url} not found`);
+      return;
+    }
+    // Reads the content attribute of a <meta property="..."> tag.
+    const meta = (property) => $(`meta[property="${property}"]`).attr('content');
+    const link = stringUtil.getContentByReg(html, /links='(.+?)\|'/);
+    const idText = subLink.replace('/dianying/', '').replace('.html', '');
+    // Always pass the radix so the id is parsed as decimal.
+    const idx = Number.parseInt(idText, 10);
+    const obj = {
+      id: idx,
+      daoYan: meta('og:video:director'),
+      zhuYan: meta('og:video:actor'),
+      grade: meta('og:video:score'),
+      img: meta('og:image'),
+      introduce: meta('og:description'),
+      language: '',
+      name: meta('og:title'),
+      name_alias: meta('og:video:alias'),
+      playResourceUrl: link,
+      region: meta('og:video:area'),
+      releaseDate: meta('og:video:release_date'),
+      type: meta('og:video:class'),
+    };
+    const sdata = new SpiderDataModel({
+      type: 'movie',
+      data: obj,
+      status: 0,
+    });
+    await sdata.save();
+    console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
+  } catch (err) {
+    console.log(err);
+  }
+};
+
+/**
+ * Fetch one page of the movie listing and queue every linked detail page
+ * for parsing.
+ *
+ * @param {number} idx - 1-based listing page number; page 1 is `index.html`,
+ *   later pages are `index_N.html`.
+ * @returns {Promise<void>}
+ */
+const parseListPage = async (idx) => {
+  const url = idx === 1
+    ? `${BASE_URL}/dianying/index.html`
+    : `${BASE_URL}/dianying/index_${idx}.html`;
+  console.log(`begin parse page: ${idx}`);
+  let html;
+  try {
+    html = await netUtil.getData(url, {});
+  } catch (err) {
+    console.log(err);
+  }
+  if (html) {
+    const $ = cheerio.load(html);
+    const pages = [];
+    $('.thumbnail').each((i, el) => {
+      const href = $(el).attr('href');
+      // Skip thumbnails without an href; they would otherwise be queued as
+      // the literal path "undefined".
+      if (href) {
+        pages.push(href);
+      }
+    });
+    for (const page of pages) {
+      try {
+        generalQueue.addQueue({
+          run: async function () {
+            await parseOnePage(page);
+          },
+        });
+      } catch (err) {
+        console.log(err);
+      }
+    }
+  }
+  console.log(`end parse page: ${idx}`);
+};
+
+/**
+ * Walk the listing pages one after another, queueing their movies.
+ *
+ * @returns {Promise<void>}
+ */
+const parseAllMovie = async () => {
+  // NOTE(review): the loop stops at maxPage - 1; confirm whether page
+  // `maxPage` itself should also be crawled.
+  const maxPage = 939;
+  for (let i = 1; i < maxPage; i++) {
+    try {
+      await parseListPage(i);
+    } catch (err) {
+      console.log(err);
+    }
+  }
+};
+
+export default {
+  run: async () => {
+    await parseAllMovie();
+  },
+};
diff --git a/src/utils/general.queue.js b/src/utils/general.queue.js
index 9c3df04..25d7c21 100644
--- a/src/utils/general.queue.js
+++ b/src/utils/general.queue.js
@@ -16,7 +16,6 @@ q.drain = function(){
 };
 module.exports = {
   addQueue(obj) {
-    console.log('add obj to queue', obj);
     q.push(obj, function(err){
       if (err) {
         console.log('error parse: ', obj, err);
diff --git a/src/utils/string.util.js b/src/utils/string.util.js
index 7fe0930..78da2dd 100644
--- a/src/utils/string.util.js
+++ b/src/utils/string.util.js
@@ -17,4 +17,27 @@ export default {
       return content.replace(/<.+?>/g, '').replace(/\s/g, '');
     }
   },
+
+  /**
+   * Return the first capture group of `reg` matched against `str`.
+   *
+   * @param {string} str - text to search.
+   * @param {RegExp} reg - pattern containing at least one capture group.
+   * @returns {string} the captured text, or '' when nothing was captured.
+   */
+  getContentByReg(str, reg) {
+    if (typeof str !== 'string') {
+      return '';
+    }
+    // With a /g flag String#match returns every full match and no capture
+    // groups, so index 1 would be the second match; use a non-global copy.
+    const pattern = reg.global
+      ? new RegExp(reg.source, reg.flags.replace('g', ''))
+      : reg;
+    const contents = str.match(pattern);
+    if (contents && contents.length > 1 && contents[1] !== undefined) {
+      return contents[1];
+    }
+    return '';
+  }
 }