From 875994edf9b5ec26268499892dc5eee04d5391a5 Mon Sep 17 00:00:00 2001 From: zhl Date: Wed, 3 Apr 2019 13:21:30 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=A4=84=E7=90=86=E5=88=97?= =?UTF-8?q?=E8=A1=A8=E6=97=B6=E7=9A=84=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 ++ docs/qmxs.md | 31 ++++++++++++-- src/sites/book.js | 99 +++++++++++++++++++++++++++++++------------ src/utils/net.util.js | 3 ++ 4 files changed, 107 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 936b88d..6cee8bf 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,6 @@ sites/hoh8.js +## 20190403 全民小说抓取 + +sites/book.js diff --git a/docs/qmxs.md b/docs/qmxs.md index 5176b43..70da0d4 100644 --- a/docs/qmxs.md +++ b/docs/qmxs.md @@ -10,7 +10,7 @@ https://bc.dzjgmp.com/core/book/type.api?sex=1 Response: -``` +```json { code: 1, data: [ { ltype_id: 0, @@ -36,7 +36,7 @@ https://bc.dzjgmp.com/core/book/type_all.api?sex=1&ltype=0&stype=-1&status=-1&pa Response: -``` +```json { code: 1, data: { total: 34248, @@ -51,7 +51,7 @@ Response: ``` 单本书的结构 -``` +```json { book_id: 668, name: '逆天邪神', author: '火星引力', @@ -73,6 +73,31 @@ https://cache.dzjgmp.com/book.api?book_id=10887 https://cache.dzjgmp.com/book_chapter.api?book_id=10887&site_id=wsg +Response: + +```json +{ code: 1, + data: + [ { lists_id: 328, + chapter_id: 9011, + num: 0, + name: '第一章 小镇少年', + crawl_book_id: 739477, + url: 'http://www.2kxs.com/xiaoshuo/22/22295/6632341.html' }, + ] +} +``` + ## 章节详情 https://cache.dzjgmp.com/chapter.api?chapter_id=19873&crawl_book_id=839413&id=418&site_id=wsg + +Response: + +```json +{ chapter_id: 9011, + name: '第一章 小镇少年', + crawl_book_id: 739477, + content: '' +} +``` diff --git a/src/sites/book.js b/src/sites/book.js index 8c411bf..d8794a7 100644 --- a/src/sites/book.js +++ b/src/sites/book.js @@ -38,19 +38,57 @@ const categoryList = async (sex) => { const bookList = async (sex, ltype, 
page) => { console.log(`get book list, sex: ${sex}, ltype: ${ltype}, page: ${page}`); let url = `https://bc.dzjgmp.com/core/book/type_all.api?sex=${sex}&ltype=${ltype}&stype=-1&status=-1&page=${page}&limit=20`; + try { + let data = await netUtil.getData(url, generateHeader()); + data = JSON.parse(data); + return data; + } catch (err) { + console.log(err); + return {data: {list: []}}; + } +} + +/** + * 单本书籍的详情 + * @param {string} bookId 书籍id + * */ +const bookInfo = async (bookId) => { + console.log(`parse book: ${bookId}`); + //书籍详情 + let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`; + let data = await netUtil.getData(url, generateHeader()); + //书籍章节列表 + let chapterListData = await bookChapterList(bookId, 'kxs2'); + chapterListData = JSON.parse(chapterListData); + console.log(chapterListData); + data = JSON.parse(data); + return data.data; +} +/** + * 获取书本章节列表 + * @param {string} bookId 书本id + * @param {string} siteId 线路id,从书本详情返回的data.sites中获取 + * */ +const bookChapterList = async (bookId, siteId) => { + let url = `https://cache.dzjgmp.com/book_chapter.api?book_id=${bookId}&site_id=${siteId}`; let data = await netUtil.getData(url, generateHeader()); data = JSON.parse(data); return data; } - -const bookInfo = async (bookId) => { - console.log(`parse book: ${bookId}`); - let url = `https://cache.dzjgmp.com/book.api?book_id=${bookId}`; +/** + * 获取单个章节的详情 + * @param {number} chapterId 章节id + * @param {number} crawlBookId 从章节列表返回数据中的 data.crawl_book_id 获取 + * @param {number} listId 从章节列表返回数据中的 data.lists_id 获取 + * @param {string} siteId 线路id + * */ +const bookChapterInfo = async (chapterId, crawlBookId, listId, siteId) => { + let url = `https://cache.dzjgmp.com/chapter.api?chapter_id=${chapterId}&crawl_book_id=${crawlBookId}&id=${listId}&site_id=${siteId}`; let data = await netUtil.getData(url, generateHeader()); data = JSON.parse(data); - return data.data; + return data; } -// 处理单页的列表数据 +// 处理单页的列表数据,保存进库 const parsePageObj = async (dataArr) => { for(let 
data of dataArr) { let record = new SpiderData({ @@ -61,25 +99,30 @@ const parsePageObj = async (dataArr) => { } } // 获取所有分类下的数据列表 -const parseAllCategory = async () => { - try { - let cateList = await SpiderCategory.find({type: 'book'}); - for(const cate of cateList) { - let firstPage = await bookList(cate.data.sex, cate.data.ltype_id, 1); - let totalPage = firstPage.data.last_page; - cate.data.total = firstPage.data.total; - cate.markModified('data'); - await cate.save(); - console.log(`parse ltype: ${cate.data.ltype_id}, totalPage: ${totalPage}`); - await parsePageObj(firstPage.data.list); - for(let i = 2; i <= totalPage; i ++) { - let pageObj = await bookList(cate.data.sex, cate.data.ltype_id, i); - await parsePageObj(pageObj.data.list); - } +const parseAllCategory = async (sex, ltype) => { + sex = sex || 1; + ltype = ltype || 0; + try { + let cateList = await SpiderCategory.find({type: 'book'}); + for(const cate of cateList) { + if (cate.data.sex < sex || cate.data.ltype_id < ltype) { + continue; + } + let firstPage = await bookList(cate.data.sex, cate.data.ltype_id, 1); + let totalPage = firstPage.data.last_page; + cate.data.total = firstPage.data.total; + cate.markModified('data'); + await cate.save(); + console.log(`parse ltype: ${cate.data.ltype_id}, totalPage: ${totalPage}`); + await parsePageObj(firstPage.data.list); + for(let i = 2; i <= totalPage; i ++) { + let pageObj = await bookList(cate.data.sex, cate.data.ltype_id, i); + await parsePageObj(pageObj.data.list); } - } catch (err) { - console.log(err); } + } catch (err) { + console.log(err); + } } const parseAllBookList = async () => { @@ -102,11 +145,15 @@ const parseAllBookList = async () => { export default { run: async () => { + //step 1: 获取所有的分类 // await categoryList(1); // await categoryList(2); - // await bookList(1, 0, 1); + // step 2: 获取所有分类下的书籍列表 // 获取所有分类下的数据列表 - // await parseAllCategory(); - await parseAllBookList(); + await parseAllCategory(2, 0); + // 获取所有数据的详情 + // await 
parseAllBookList(); + // let data = await bookChapterInfo(9011, 739477, 328, 'kxs2'); + // console.log(data.data); } } diff --git a/src/utils/net.util.js b/src/utils/net.util.js index b766b70..7c231fe 100644 --- a/src/utils/net.util.js +++ b/src/utils/net.util.js @@ -7,6 +7,9 @@ const requestData = (options) => { if (err) { return reject(err); } + if (response.statusCode >= 300) { + return reject(new Error('server response code: ' + response.statusCode)); + } resolve(body); }); });