diff --git a/package.json b/package.json
index 17bbe27..feec136 100644
--- a/package.json
+++ b/package.json
@@ -25,6 +25,7 @@
     "glob": "^7.1.3",
     "mkdirp": "^0.5.1",
     "mongoose": "^5.2.15",
+    "multi-progress": "^2.0.0",
     "request": "^2.88.0",
     "request-promise": "^4.2.4",
     "ws": "^6.1.2"
diff --git a/src/sites/book.js b/src/sites/book.js
index f40665f..7eaa31f 100644
--- a/src/sites/book.js
+++ b/src/sites/book.js
@@ -1,6 +1,10 @@
 import netUtil from '../utils/net.util';
 import SpiderCategory from '../models/SpiderCategory';
 import SpiderData from '../models/SpiderData';
+import generalQueue from '../utils/general.queue';
+
+let Multiprogress = require("multi-progress")
+let multi = new Multiprogress(process.stderr);
 
 const generateHeader = () => {
     return {
@@ -57,10 +61,6 @@ const bookInfo = async (bookId) => {
     //书籍详情
     let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`;
     let data = await netUtil.getData(url, generateHeader());
-    //书籍章节列表
-    let chapterListData = await bookChapterList(bookId, 'kxs2');
-    chapterListData = JSON.parse(chapterListData);
-    console.log(chapterListData);
     data = JSON.parse(data);
     return data.data;
 }
@@ -70,6 +70,7 @@ const bookInfo = async (bookId) => {
  * @param {string} siteId 线路id,从书本详情返回的data.sites中获取
  * */
 const bookChapterList = async (bookId, siteId) => {
+    console.log(`get chapter list of book: ${bookId}, siteId: ${siteId}`);
     let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`;
     let data = await netUtil.getData(url, generateHeader());
     data = JSON.parse(data);
@@ -83,6 +84,7 @@ const bookChapterList = async (bookId, siteId) => {
  * @param {string} siteId 线路id
  * */
 const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => {
+    // console.log(`get chapter info of chapter: ${chapterId}, crawlBookId: ${crawlBookId}, listId: ${listId}, siteId: ${siteId}`);
     let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`;
     let data = await netUtil.getData(url, generateHeader());
     data = JSON.parse(data);
@@ -129,19 +131,70 @@ const parseAllCategory = async (sex, ltype) => {
     }
 }
 
-const parseAllBookList = async () => {
-    let bookList = await SpiderData.find({status: 0}).sort({'data.book_id': 1}).limit(1);
-    for(let book of bookList) {
-        console.time('oneBook');
-        let record = await bookInfo(book.data.book_id);
-        for (const key in record) {
-            if ({}.hasOwnProperty.call(record, key)) {
-                book.data[key] = record[key];
-            }
+/**
+ * Process a single book (status is saved as 1, 2, 3 after each step):
+ * 1. Fetch the book details and merge them into book.data
+ * 2. Fetch the chapter list from the first site that returns one
+ * 3. Fetch every chapter's content from that same site
+ * */
+const parseOneBook = async (book) => {
+    console.log(`begin parse book: ${book.data.book_id}, name: ${book.data.name}`);
+    const bookId = book.data.book_id;
+    let record = await bookInfo(bookId);
+    for (const key in record) {
+        if ({}.hasOwnProperty.call(record, key)) {
+            book.data[key] = record[key];
         }
-        book.markModified('data');
-        console.timeEnd('oneBook')
-        await book.save();
+    }
+    book.status = 1;
+    book.markModified('data');
+    await book.save();
+    if (!book.data || !book.data.sites || book.data.sites.length === 0) {
+        return;
+    }
+    let siteId;
+    let chapterList;
+    // Chapter list: some sites cannot return one, so try each site in turn
+    // until one succeeds, and remember which site the chapters came from.
+    for(let site of book.data.sites) {
+        let chapterListData = await bookChapterList(bookId, site.site);
+        if (chapterListData.code === 1) {
+            chapterList = chapterListData.data;
+            siteId = site.site;
+            break;
+        }
+    }
+    if (!siteId || !chapterList) {
+        return;
+    }
+    book.data.chapter_list = chapterList;
+    book.status = 2;
+    book.markModified('data');
+    await book.save();
+    let bar = multi.newBar(' '+book.data.name+' [:bar] :percent :etas', {
+        complete: '=',
+        incomplete: ' ',
+        width: 30,
+        total: chapterList.length
+    });
+    for (let chapter of chapterList) {
+        let obj = await bookChapterInfo(chapter.chapter_id, chapter.crawl_book_id, chapter.lists_id, siteId);
+        bar.tick(1);
+        chapter.content = obj.data.content;
+    }
+    book.status = 3;
+    book.data.chapter_list = chapterList;
+    book.markModified('data');
+    await book.save();
+}
+const parseAllBookList = async (start) => {
+    let bookList = await SpiderData.find({status: 0, type: 'book'}).sort({'data.book_id': 1}).limit(15);
+    for(let book of bookList) {
+        generalQueue.addQueue({
+            run: async function () {
+                await parseOneBook(book);
+            }
+        })
     }
 }
 
@@ -154,10 +207,8 @@ export default {
         // await categoryList(2);
         // step 2: 获取所有分类下的书籍列表
         // 获取所有分类下的数据列表
-        await parseAllCategory(2, 0);
+        // await parseAllCategory(2, 0);
         // 获取所有数据的详情
-        // await parseAllBookList();
-        // let data = await bookChapterInfo(9011, 739477, 328, 'kxs2');
-        // console.log(data.data);
+        await parseAllBookList();
     }
 }
diff --git a/src/utils/general.queue.js b/src/utils/general.queue.js
new file mode 100644
index 0000000..9c3df04
--- /dev/null
+++ b/src/utils/general.queue.js
@@ -0,0 +1,26 @@
+import async from 'async';
+
+/**
+ * Operation queue
+ * */
+let q = async.queue( async (reqObj, cb) => {
+    try {
+        await reqObj.run();
+        cb();
+    } catch (err) {
+        cb(err);
+    }
+}, 10);
+q.drain = function(){
+    console.info('all queue done');
+};
+module.exports = {
+    addQueue(obj) {
+        console.log('add obj to queue', obj);
+        q.push(obj, function(err){
+            if (err) {
+                console.log('error parse: ', obj, err);
+            }
+        });
+    }
+}