添加蛋蛋赞网站的抓取

This commit is contained in:
zhl 2019-04-26 19:03:00 +08:00
parent 4b3b58c508
commit dcceb648d1
5 changed files with 110 additions and 2 deletions

View File

@ -7,6 +7,7 @@ import hoh8 from './sites/hoh8';
import movie from './sites/movie';
import book from './sites/book';
import bookChapter from './sites/bookChapter';
import dandanzan from './sites/dandanzan';
mongoose.Promise = Promise;
@ -21,8 +22,9 @@ db.once('open', function () {
logger.info('Connected to db.');
// hoh8.run();
// book.run();
movie.run();
// movie.run();
// bookChapter.run();
dandanzan.run();
});
mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});

View File

@ -6,6 +6,7 @@ const Movies = new Schema({
gid: {type: Number, required: true},
// 电影名
name: {type: String},
name_alias: {type: String},
/**
科幻片: 1
喜剧片: 2

97
src/sites/dandanzan.js Normal file
View File

@ -0,0 +1,97 @@
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import SpiderDataModel from "../models/SpiderData";
import generalQueue from '../utils/general.queue';
/**
 * Fetches a single movie detail page, extracts its metadata from the page's
 * Open Graph `<meta>` tags and saves it as a SpiderData document of type
 * `movie` with status 0.
 *
 * Any error raised while fetching, parsing or saving is logged and not
 * rethrown, so the returned promise resolves either way.
 *
 * @param {string} subLink - Site-relative path of the detail page, e.g.
 *   `/dianying/123.html`; it is appended to the site origin as-is.
 * @returns {Promise<void>}
 */
const parseOnePage = async (subLink) => {
  const url = `https://www.dandanzan.com${subLink}`;
  try {
    const html = await netUtil.getData(url, {});
    const $ = cheerio.load(html);

    // The site renders a `.error404` element with text for missing pages.
    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }

    // Reads the `content` attribute of the <meta> tag with the given property.
    const meta = (property) => $(`meta[property="${property}"]`).attr('content');

    // The play resource is embedded in the raw HTML as links='<value>|'.
    const link = stringUtil.getContentByReg(html, /links='(.+?)\|'/);

    // The numeric id is what remains of the path once the section prefix and
    // the extension are stripped; the radix is explicit so the text is always
    // read as decimal.
    const idx = Number.parseInt(
      subLink.replace('/dianying/', '').replace('.html', ''),
      10
    );

    const obj = {
      id: idx,
      daoYan: meta('og:video:director'),
      zhuYan: meta('og:video:actor'),
      grade: meta('og:video:score'),
      img: meta('og:image'),
      introduce: meta('og:description'),
      language: '',
      name: meta('og:title'),
      name_alias: meta('og:video:alias'),
      playResourceUrl: link,
      region: meta('og:video:area'),
      releaseDate: meta('og:video:release_date'),
      type: meta('og:video:class')
    };

    const sdata = new SpiderDataModel({
      type: 'movie',
      data: obj,
      status: 0
    });
    await sdata.save();
    console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
  } catch (err) {
    console.log(err);
  }
};
/**
 * Fetches one movie list page, collects the `href` of every `.thumbnail`
 * element on it and enqueues one `parseOnePage` job per link on the general
 * queue.
 *
 * A failed fetch is logged and the page is skipped; a failure while enqueuing
 * one link is logged and the remaining links are still enqueued.
 *
 * @param {number} idx - 1-based list page number. Page 1 is served from
 *   `index.html`, every other page from `index_<idx>.html`.
 * @returns {Promise<void>}
 */
const parseListPage = async (idx) => {
  const url = idx === 1
    ? `https://www.dandanzan.com/dianying/index.html`
    : `https://www.dandanzan.com/dianying/index_${idx}.html`;
  console.log(`begin parse page: ${idx}`);

  let html;
  try {
    html = await netUtil.getData(url, {});
  } catch (err) {
    console.log(err);
  }

  if (html) {
    const $ = cheerio.load(html);
    const pages = [];
    $('.thumbnail').each((i, element) => {
      const href = $(element).attr('href');
      // Skip thumbnails without an href: queueing `undefined` would later
      // request the bogus URL "https://www.dandanzan.comundefined".
      if (href) {
        pages.push(href);
      }
    });

    for (const page of pages) {
      try {
        generalQueue.addQueue({
          async run() {
            await parseOnePage(page);
          }
        });
      } catch (err) {
        console.log(err);
      }
    }
  }
  console.log(`end parse page: ${idx}`);
};
/**
 * Walks the movie list pages one at a time, starting at page 1, awaiting
 * `parseListPage` for each before moving on. An error thrown for one page is
 * logged and the walk continues with the next page.
 *
 * @returns {Promise<void>}
 */
const parseAllMovie = async () => {
  // NOTE(review): the bound is exclusive, so the last page visited is
  // maxPage - 1 (938). If 939 is meant to be the last page number, this
  // skips it — confirm against the site.
  const maxPage = 939;
  let pageNo = 1;
  while (pageNo < maxPage) {
    try {
      await parseListPage(pageNo);
    } catch (error) {
      console.log(error);
    }
    pageNo += 1;
  }
};
// Module entry point: `run()` starts the crawl over all movie list pages and
// resolves once `parseAllMovie` has finished.
export default {
  async run() {
    await parseAllMovie();
  }
};

View File

@ -16,7 +16,6 @@ q.drain = function(){
};
module.exports = {
addQueue(obj) {
console.log('add obj to queue', obj);
q.push(obj, function(err){
if (err) {
console.log('error parse: ', obj, err);

View File

@ -17,4 +17,13 @@ export default {
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
}
},
getContentByReg(str, reg) {
const contents = str.match(reg);
if (contents && contents.length > 1) {
return contents[1]
} else {
return '';
}
}
}