抓取书本详情使用队列

This commit is contained in:
zhl 2019-04-03 16:04:17 +08:00
parent a83dc46079
commit 53e6efaec9
3 changed files with 98 additions and 20 deletions

View File

@ -25,6 +25,7 @@
"glob": "^7.1.3", "glob": "^7.1.3",
"mkdirp": "^0.5.1", "mkdirp": "^0.5.1",
"mongoose": "^5.2.15", "mongoose": "^5.2.15",
"multi-progress": "^2.0.0",
"request": "^2.88.0", "request": "^2.88.0",
"request-promise": "^4.2.4", "request-promise": "^4.2.4",
"ws": "^6.1.2" "ws": "^6.1.2"

View File

@ -1,6 +1,10 @@
import netUtil from '../utils/net.util'; import netUtil from '../utils/net.util';
import SpiderCategory from '../models/SpiderCategory'; import SpiderCategory from '../models/SpiderCategory';
import SpiderData from '../models/SpiderData'; import SpiderData from '../models/SpiderData';
import generalQueue from '../utils/general.queue';
let Multiprogress = require("multi-progress")
let multi = new Multiprogress(process.stderr);
const generateHeader = () => { const generateHeader = () => {
return { return {
@ -57,10 +61,6 @@ const bookInfo = async (bookId) => {
//书籍详情 //书籍详情
let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`; let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`;
let data = await netUtil.getData(url, generateHeader()); let data = await netUtil.getData(url, generateHeader());
//书籍章节列表
let chapterListData = await bookChapterList(bookId, 'kxs2');
chapterListData = JSON.parse(chapterListData);
console.log(chapterListData);
data = JSON.parse(data); data = JSON.parse(data);
return data.data; return data.data;
} }
@ -70,6 +70,7 @@ const bookInfo = async (bookId) => {
* @param {string} siteId 线路id从书本详情返回的data.sites中获取 * @param {string} siteId 线路id从书本详情返回的data.sites中获取
* */ * */
const bookChapterList = async (bookId, siteId) => { const bookChapterList = async (bookId, siteId) => {
console.log(`get chapter list of book: ${bookId}, siteId: ${siteId}`);
let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`; let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`;
let data = await netUtil.getData(url, generateHeader()); let data = await netUtil.getData(url, generateHeader());
data = JSON.parse(data); data = JSON.parse(data);
@ -83,6 +84,7 @@ const bookChapterList = async (bookId, siteId) => {
* @param {string} siteId 线路id * @param {string} siteId 线路id
* */ * */
const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => { const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => {
// console.log(`get chapter info of chapter: ${chapterId}, crawlBookId: ${crawlBookId}, listId: ${listId}, siteId: ${siteId}`);
let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`; let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`;
let data = await netUtil.getData(url, generateHeader()); let data = await netUtil.getData(url, generateHeader());
data = JSON.parse(data); data = JSON.parse(data);
@ -129,19 +131,70 @@ const parseAllCategory = async (sex, ltype) => {
} }
} }
const parseAllBookList = async () => { /**
let bookList = await SpiderData.find({status: 0}).sort({'data.book_id': 1}).limit(1); * 处理一本书
for(let book of bookList) { * 1. 获取书籍详情
console.time('oneBook'); * 2. 获取章节列表
let record = await bookInfo(book.data.book_id); * 3. 获取章节详情
* */
/**
 * Crawl a single book and persist the progress on the given document.
 *
 * Steps (book.status is advanced and the document saved after each one):
 *   1. fetch the book detail and merge every own key into book.data -> status 1
 *   2. fetch the chapter list, trying each entry of book.data.sites
 *      in order until one answers with code === 1                   -> status 2
 *   3. fetch every chapter's content through that same site         -> status 3
 *
 * The function returns early (leaving status at 1) when the book has no
 * sites or when no site yields a chapter list.
 *
 * @param {Object} book document exposing `data` (with `book_id` and `name`),
 *                      `status`, `markModified(path)` and an async `save()`
 * @returns {Promise<void>}
 */
const parseOneBook = async (book) => {
    console.log(`begin parse book: ${book.data.book_id}, name: ${book.data.name}`);
    const bookId = book.data.book_id;

    // Step 1: merge the detail record into the stored data.
    const record = await bookInfo(bookId);
    for (const key in record) {
        if ({}.hasOwnProperty.call(record, key)) {
            book.data[key] = record[key];
        }
    }
    book.status = 1;
    book.markModified('data');
    await book.save();

    if (!book.data.sites || book.data.sites.length === 0) {
        return;
    }

    // Step 2: some sites cannot deliver a chapter list, so try them one by
    // one and remember WHICH site answered - the chapter requests below must
    // go to that same site.
    let siteId;
    let chapterList;
    for (const site of book.data.sites) {
        const chapterListData = await bookChapterList(bookId, site.site);
        if (chapterListData.code === 1 && Array.isArray(chapterListData.data)) {
            chapterList = chapterListData.data;
            siteId = site.site;
            break;
        }
    }
    // Nothing usable came back: stop here instead of saving an undefined
    // chapter list and crashing on chapterList.length.
    if (!siteId || !chapterList) {
        return;
    }
    book.data.chapter_list = chapterList;
    book.status = 2;
    book.markModified('data');
    await book.save();

    // Step 3: fetch chapter contents sequentially, one progress tick each.
    const bar = multi.newBar(` ${book.data.name} [:bar] :percent :etas`, {
        complete: '=',
        incomplete: ' ',
        width: 30,
        total: chapterList.length
    });
    for (const chapter of chapterList) {
        const obj = await bookChapterInfo(chapter.chapter_id, chapter.crawl_book_id, chapter.lists_id, siteId);
        bar.tick(1);
        chapter.content = obj.data.content;
    }
    book.status = 3;
    book.data.chapter_list = chapterList;
    book.markModified('data');
    await book.save();
};
/**
 * Enqueue a crawl job for a batch of unprocessed books.
 *
 * Queries up to 15 SpiderData records with status 0 and type 'book',
 * ordered by ascending data.book_id, and pushes one task per record onto
 * the general queue; each task runs parseOneBook for its record.
 * The function resolves once the tasks are queued, not once they finish.
 *
 * @param {*} start accepted but not used by the current implementation
 * @returns {Promise<void>}
 */
const parseAllBookList = async (start) => {
    const pendingBooks = await SpiderData
        .find({status: 0, type: 'book'})
        .sort({'data.book_id': 1})
        .limit(15);
    pendingBooks.forEach((pendingBook) => {
        const task = {
            run: async () => {
                await parseOneBook(pendingBook);
            },
        };
        generalQueue.addQueue(task);
    });
};
@ -154,10 +207,8 @@ export default {
// await categoryList(2); // await categoryList(2);
// step 2: 获取所有分类下的书籍列表 // step 2: 获取所有分类下的书籍列表
// 获取所有分类下的数据列表 // 获取所有分类下的数据列表
await parseAllCategory(2, 0); // await parseAllCategory(2, 0);
// 获取所有数据的详情 // 获取所有数据的详情
// await parseAllBookList(); await parseAllBookList();
// let data = await bookChapterInfo(9011, 739477, 328, 'kxs2');
// console.log(data.data);
} }
} }

View File

@ -0,0 +1,26 @@
import async from 'async';
/**
* 操作队列
* */
/**
 * Shared task queue.
 *
 * Every pushed task is an object exposing a `run()` method that returns a
 * promise; the queue is created with a concurrency of 10.
 *
 * The worker is deliberately a plain callback-style function. The async
 * library detects native `async` workers and does NOT hand them a callback
 * (the task is settled from the returned promise instead), so an `async`
 * worker that calls `cb` only works when the source is transpiled to
 * regular functions. This form behaves the same in both environments.
 */
const q = async.queue((reqObj, cb) => {
    // Starting from a resolved promise also turns a synchronous throw
    // inside run() into a rejection that reaches cb(err).
    Promise.resolve()
        .then(() => reqObj.run())
        .then(() => cb(), (err) => cb(err));
}, 10);
// Drain hook assigned on the queue object; it only logs a completion message.
// NOTE(review): property assignment is the async v2 way of registering this
// hook - confirm against the installed async version.
q.drain = () => {
    console.info('all queue done');
};
module.exports = {
addQueue(obj) {
console.log('add obj to queue', obj);
q.push(obj, function(err){
if (err) {
console.log('error parse: ', obj, err);
}
});
}
}