抓取书本详情使用队列
This commit is contained in:
parent
a83dc46079
commit
53e6efaec9
@ -25,6 +25,7 @@
|
|||||||
"glob": "^7.1.3",
|
"glob": "^7.1.3",
|
||||||
"mkdirp": "^0.5.1",
|
"mkdirp": "^0.5.1",
|
||||||
"mongoose": "^5.2.15",
|
"mongoose": "^5.2.15",
|
||||||
|
"multi-progress": "^2.0.0",
|
||||||
"request": "^2.88.0",
|
"request": "^2.88.0",
|
||||||
"request-promise": "^4.2.4",
|
"request-promise": "^4.2.4",
|
||||||
"ws": "^6.1.2"
|
"ws": "^6.1.2"
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
import netUtil from '../utils/net.util';
|
import netUtil from '../utils/net.util';
|
||||||
import SpiderCategory from '../models/SpiderCategory';
|
import SpiderCategory from '../models/SpiderCategory';
|
||||||
import SpiderData from '../models/SpiderData';
|
import SpiderData from '../models/SpiderData';
|
||||||
|
import generalQueue from '../utils/general.queue';
|
||||||
|
|
||||||
|
// Progress-bar factory bound to stderr; individual bars are created with multi.newBar(...).
const Multiprogress = require("multi-progress");
const multi = new Multiprogress(process.stderr);
|
||||||
|
|
||||||
const generateHeader = () => {
|
const generateHeader = () => {
|
||||||
return {
|
return {
|
||||||
@ -57,10 +61,6 @@ const bookInfo = async (bookId) => {
|
|||||||
//书籍详情
|
//书籍详情
|
||||||
let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`;
|
let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`;
|
||||||
let data = await netUtil.getData(url, generateHeader());
|
let data = await netUtil.getData(url, generateHeader());
|
||||||
//书籍章节列表
|
|
||||||
let chapterListData = await bookChapterList(bookId, 'kxs2');
|
|
||||||
chapterListData = JSON.parse(chapterListData);
|
|
||||||
console.log(chapterListData);
|
|
||||||
data = JSON.parse(data);
|
data = JSON.parse(data);
|
||||||
return data.data;
|
return data.data;
|
||||||
}
|
}
|
||||||
@ -70,6 +70,7 @@ const bookInfo = async (bookId) => {
|
|||||||
* @param {string} siteId 线路id,从书本详情返回的data.sites中获取
|
* @param {string} siteId 线路id,从书本详情返回的data.sites中获取
|
||||||
* */
|
* */
|
||||||
const bookChapterList = async (bookId, siteId) => {
|
const bookChapterList = async (bookId, siteId) => {
|
||||||
|
console.log(`get chapter list of book: ${bookId}, siteId: ${siteId}`);
|
||||||
let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`;
|
let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`;
|
||||||
let data = await netUtil.getData(url, generateHeader());
|
let data = await netUtil.getData(url, generateHeader());
|
||||||
data = JSON.parse(data);
|
data = JSON.parse(data);
|
||||||
@ -83,6 +84,7 @@ const bookChapterList = async (bookId, siteId) => {
|
|||||||
* @param {string} siteId 线路id
|
* @param {string} siteId 线路id
|
||||||
* */
|
* */
|
||||||
const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => {
|
const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => {
|
||||||
|
// console.log(`get chapter info of chapter: ${chapterId}, crawlBookId: ${crawlBookId}, listId: ${listId}, siteId: ${siteId}`);
|
||||||
let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`;
|
let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`;
|
||||||
let data = await netUtil.getData(url, generateHeader());
|
let data = await netUtil.getData(url, generateHeader());
|
||||||
data = JSON.parse(data);
|
data = JSON.parse(data);
|
||||||
@ -129,19 +131,70 @@ const parseAllCategory = async (sex, ltype) => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parseAllBookList = async () => {
|
/**
 * Process a single book.
 *   1. Fetch the book details and merge them into `book.data` (status 1).
 *   2. Fetch the chapter list from the first site that returns one (status 2).
 *   3. Fetch the content of every chapter in that list (status 3).
 *
 * The document is saved after each step, so `book.status` records how far
 * processing got if a later step fails or no usable site exists.
 *
 * @param {Object} book document exposing `data.book_id`, `data.name`,
 *                      `markModified(path)` and `save()`
 * @returns {Promise<void>}
 */
const parseOneBook = async (book) => {
  console.log(`begin parse book: ${book.data.book_id}, name: ${book.data.name}`);
  const bookId = book.data.book_id;

  // Step 1: merge the freshly fetched detail record into the stored data.
  const record = await bookInfo(bookId);
  for (const key in record) {
    if ({}.hasOwnProperty.call(record, key)) {
      book.data[key] = record[key];
    }
  }
  book.status = 1;
  book.markModified('data');
  await book.save();

  if (!book.data || !book.data.sites || book.data.sites.length === 0) {
    return;
  }

  // Step 2: some sites cannot provide a chapter list, so try them one by one
  // until one answers with code 1. Remember WHICH site answered: the chapter
  // content requests below must use the same site id as the list they came from.
  let siteId;
  let chapterList;
  for (const site of book.data.sites) {
    const chapterListData = await bookChapterList(bookId, site.site);
    if (chapterListData && chapterListData.code === 1 && Array.isArray(chapterListData.data)) {
      chapterList = chapterListData.data;
      siteId = site.site;
      break;
    }
  }
  // No site produced a chapter list: leave the book at status 1.
  if (!siteId || !chapterList) {
    return;
  }
  book.data.chapter_list = chapterList;
  book.status = 2;
  book.markModified('data');
  await book.save();

  // Step 3: fetch every chapter sequentially, ticking the progress bar per chapter.
  const bar = multi.newBar(` ${book.data.name} [:bar] :percent :etas`, {
    complete: '=',
    incomplete: ' ',
    width: 30,
    total: chapterList.length,
  });
  for (const chapter of chapterList) {
    const obj = await bookChapterInfo(chapter.chapter_id, chapter.crawl_book_id, chapter.lists_id, siteId);
    bar.tick(1);
    chapter.content = obj.data.content;
  }
  book.status = 3;
  book.data.chapter_list = chapterList;
  book.markModified('data');
  await book.save();
};
|
||||||
|
/**
 * Load up to 15 unprocessed book records (status 0, type 'book'), ordered by
 * ascending `data.book_id`, and enqueue one parsing task per record.
 *
 * The returned promise resolves once the tasks are queued, not when they finish.
 *
 * @param {*} start currently unused
 * @returns {Promise<void>}
 */
const parseAllBookList = async (start) => {
  const pendingBooks = await SpiderData
    .find({status: 0, type: 'book'})
    .sort({'data.book_id': 1})
    .limit(15);

  pendingBooks.forEach((book) => {
    generalQueue.addQueue({
      run: async () => {
        await parseOneBook(book);
      },
    });
  });
};
|
||||||
|
|
||||||
@ -154,10 +207,8 @@ export default {
|
|||||||
// await categoryList(2);
|
// await categoryList(2);
|
||||||
// step 2: 获取所有分类下的书籍列表
|
// step 2: 获取所有分类下的书籍列表
|
||||||
// 获取所有分类下的数据列表
|
// 获取所有分类下的数据列表
|
||||||
await parseAllCategory(2, 0);
|
// await parseAllCategory(2, 0);
|
||||||
// 获取所有数据的详情
|
// 获取所有数据的详情
|
||||||
// await parseAllBookList();
|
await parseAllBookList();
|
||||||
// let data = await bookChapterInfo(9011, 739477, 328, 'kxs2');
|
|
||||||
// console.log(data.data);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
26
src/utils/general.queue.js
Normal file
26
src/utils/general.queue.js
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
import async from 'async';
|
||||||
|
|
||||||
|
/**
 * Task queue.
 *
 * Runs queued task objects with a concurrency of 10. A task is any object
 * exposing a `run()` function; its returned promise decides whether the task
 * completes successfully or with an error.
 */
const q = async.queue((reqObj, cb) => {
  // The worker is deliberately a plain callback-style function. The async
  // library does not hand a callback to native `async` functions (it uses the
  // returned promise instead), so calling `cb` from an async worker would throw
  // and mark every task as failed.
  Promise.resolve()
    .then(() => reqObj.run())
    .then(() => cb(), (err) => cb(err));
}, 10);

// NOTE(review): assigning `drain` as a property is the async v2 API; async v3
// expects `q.drain(fn)` instead — confirm against the installed version.
q.drain = function () {
  console.info('all queue done');
};
|
||||||
|
module.exports = {
|
||||||
|
addQueue(obj) {
|
||||||
|
console.log('add obj to queue', obj);
|
||||||
|
q.push(obj, function(err){
|
||||||
|
if (err) {
|
||||||
|
console.log('error parse: ', obj, err);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user