添加蛋蛋赞网站的抓取
This commit is contained in:
parent
4b3b58c508
commit
dcceb648d1
@ -7,6 +7,7 @@ import hoh8 from './sites/hoh8';
|
||||
import movie from './sites/movie';
|
||||
import book from './sites/book';
|
||||
import bookChapter from './sites/bookChapter';
|
||||
import dandanzan from './sites/dandanzan';
|
||||
|
||||
mongoose.Promise = Promise;
|
||||
|
||||
@ -21,8 +22,9 @@ db.once('open', function () {
|
||||
logger.info('Connected to db.');
|
||||
// hoh8.run();
|
||||
// book.run();
|
||||
movie.run();
|
||||
// movie.run();
|
||||
// bookChapter.run();
|
||||
dandanzan.run();
|
||||
});
|
||||
mongoose.connect(config.db, {promiseLibrary: Promise, useNewUrlParser: true});
|
||||
|
||||
|
@ -6,6 +6,7 @@ const Movies = new Schema({
|
||||
gid: {type: Number, required: true},
|
||||
// 电影名
|
||||
name: {type: String},
|
||||
name_alias: {type: String},
|
||||
/**
|
||||
科幻片: 1
|
||||
喜剧片: 2
|
||||
|
97
src/sites/dandanzan.js
Normal file
97
src/sites/dandanzan.js
Normal file
@ -0,0 +1,97 @@
|
||||
import netUtil from "../utils/net.util";
|
||||
import cheerio from "cheerio";
|
||||
import stringUtil from '../utils/string.util';
|
||||
import SpiderDataModel from "../models/SpiderData";
|
||||
import generalQueue from '../utils/general.queue';
|
||||
|
||||
/**
 * Scrape one movie detail page and store the extracted fields as a
 * SpiderData record.
 *
 * The page at `https://www.dandanzan.com${subLink}` is downloaded and loaded
 * into cheerio. If the page has a non-empty `.error404` element it is logged
 * as not found and nothing is saved. Otherwise the movie attributes are read
 * from the page's `<meta property="og:...">` tags, the play link is pulled
 * out of the raw HTML with a regular expression, and a `SpiderDataModel`
 * document (`type: 'movie'`, `status: 0`) is saved.
 *
 * Errors raised while downloading, parsing or saving are logged and not
 * rethrown, so the returned promise always resolves.
 *
 * @param {string} subLink - Site-relative path such as `/dianying/123.html`.
 * @returns {Promise<void>}
 */
const parseOnePage = async (subLink) => {
  const url = `https://www.dandanzan.com${subLink}`;
  try {
    const html = await netUtil.getData(url, {});
    const $ = cheerio.load(html);

    if ($('.error404').text()) {
      console.log(`>>>>>>>>>>>> ${url} not found`);
      return;
    }

    // Reads the `content` attribute of one Open Graph style <meta> tag.
    const meta = (property) => $(`meta[property="${property}"]`).attr('content');

    const link = stringUtil.getContentByReg(html, /links='(.+?)\|'/);
    // Always parse the numeric id as base 10; without a radix a value such as
    // "0x10" would be interpreted as hexadecimal.
    const idx = Number.parseInt(
      subLink.replace('/dianying/', '').replace('.html', ''),
      10
    );

    const obj = {
      id: idx,
      daoYan: meta('og:video:director'),
      zhuYan: meta('og:video:actor'),
      grade: meta('og:video:score'),
      img: meta('og:image'),
      introduce: meta('og:description'),
      language: '',
      name: meta('og:title'),
      name_alias: meta('og:video:alias'),
      playResourceUrl: link,
      region: meta('og:video:area'),
      releaseDate: meta('og:video:release_date'),
      type: meta('og:video:class'),
    };

    const sdata = new SpiderDataModel({
      type: 'movie',
      data: obj,
      status: 0,
    });
    await sdata.save();
    console.log(`@@@@@ ${subLink} @ ${obj.name} saved`);
  } catch (err) {
    console.log(err);
  }
};
|
||||
|
||||
/**
 * Fetch one movie list page and enqueue a detail-page job for every movie
 * linked from it.
 *
 * Page 1 lives at `/dianying/index.html`; every other page at
 * `/dianying/index_<idx>.html`. Each `.thumbnail` element's `href` is
 * collected and handed to `generalQueue.addQueue` as a job whose `run`
 * calls `parseOnePage` for that link.
 *
 * A failed download is logged and the page is skipped; errors thrown while
 * enqueueing a job are logged and the remaining links are still processed.
 *
 * @param {number} idx - 1-based list page number.
 * @returns {Promise<void>}
 */
const parseListPage = async (idx) => {
  const url = idx === 1
    ? 'https://www.dandanzan.com/dianying/index.html'
    : `https://www.dandanzan.com/dianying/index_${idx}.html`;

  console.log(`begin parse page: ${idx}`);

  let html;
  try {
    html = await netUtil.getData(url, {});
  } catch (err) {
    console.log(err);
  }

  if (html) {
    const $ = cheerio.load(html);
    const pages = [];
    $('.thumbnail').each((i, el) => {
      const href = $(el).attr('href');
      // Thumbnails without an href would otherwise be queued as the bogus
      // URL "https://www.dandanzan.comundefined".
      if (href) {
        pages.push(href);
      }
    });

    for (const page of pages) {
      try {
        generalQueue.addQueue({
          async run() {
            await parseOnePage(page);
          },
        });
      } catch (err) {
        console.log(err);
      }
    }
  }

  console.log(`end parse page: ${idx}`);
};
|
||||
/**
 * Walk the movie list pages one after another, from page 1 up to and
 * including `maxPage`.
 *
 * Pages are processed sequentially (each `parseListPage` call is awaited
 * before the next starts). An error from one page is logged and the loop
 * continues with the next page.
 *
 * @param {number} [maxPage=939] - Number of the last list page to crawl.
 * @returns {Promise<void>}
 */
const parseAllMovie = async (maxPage = 939) => {
  // Inclusive upper bound: a strict `<` would never visit the last page.
  for (let page = 1; page <= maxPage; page++) {
    try {
      await parseListPage(page);
    } catch (err) {
      console.log(err);
    }
  }
};
|
||||
|
||||
/**
 * Module entry point for the dandanzan spider.
 */
export default {
  /**
   * Start the crawl; the returned promise settles once `parseAllMovie` has
   * finished.
   *
   * @returns {Promise<void>}
   */
  async run() {
    await parseAllMovie();
  },
};
|
@ -16,7 +16,6 @@ q.drain = function(){
|
||||
};
|
||||
module.exports = {
|
||||
addQueue(obj) {
|
||||
console.log('add obj to queue', obj);
|
||||
q.push(obj, function(err){
|
||||
if (err) {
|
||||
console.log('error parse: ', obj, err);
|
||||
|
@ -17,4 +17,13 @@ export default {
|
||||
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
|
||||
}
|
||||
},
|
||||
|
||||
getContentByReg(str, reg) {
|
||||
const contents = str.match(reg);
|
||||
if (contents && contents.length > 1) {
|
||||
return contents[1]
|
||||
} else {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user