From 250f17a35a351a6acaeee4d58c0768a070de3348 Mon Sep 17 00:00:00 2001
From: zhl
Date: Thu, 9 May 2019 11:14:52 +0800
Subject: [PATCH] Add crawl logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                        |  5 ++++
 src/models/spider/CrawlRecord.js | 42 ++++++++++++++++++++++++++++++++
 src/sites/dandanzan.js           | 39 +++++++++++++++++++++++------
 3 files changed, 78 insertions(+), 8 deletions(-)
 create mode 100644 src/models/spider/CrawlRecord.js

diff --git a/README.md b/README.md
index 1f25ea4..55de5b2 100644
--- a/README.md
+++ b/README.md
@@ -14,3 +14,8 @@ sites/bookChapter.js
 ## 20190426 Dandanzan movie and TV data
 
 sites/dandanzan.js
+
+
+## 20190508 Add proxy data crawling
+
+sites/proxy.js
diff --git a/src/models/spider/CrawlRecord.js b/src/models/spider/CrawlRecord.js
new file mode 100644
index 0000000..6076558
--- /dev/null
+++ b/src/models/spider/CrawlRecord.js
@@ -0,0 +1,42 @@
+import mongoose from 'mongoose';
+import MovieModel from "../Movies";
+
+let Schema = mongoose.Schema;
+
+
+let CrawlRecordSchema = new Schema({
+  // URL that was crawled
+  url: {type: String},
+  // class that ran the crawl task
+  className: {type: String},
+  // method that ran the crawl task
+  methodName: {type: String},
+  // arguments passed to that method
+  params: {type: Schema.Types.Mixed},
+  lastTry: {type: Date, default: Date.now},
+  tryCount: {type: Number, default: 0},
+  errCount: {type: Number, default: 0},
+  lastStatus: {type: Boolean, default: false}
+}, {
+  collection: 'crawl_record',
+  timestamps: true
+});
+
+class CrawlRecordClass {
+  static async updateRecord(record) {
+    const query = {url: record.url};
+    const options = {upsert: true, setDefaultsOnInsert: true};
+    let incObj = {tryCount: 1};
+    if (!record.lastStatus) {
+      incObj.errCount = 1;
+    }
+    record['$inc'] = incObj;
+    record.lastTry = new Date();
+    await CrawlRecordModel.update(query, record, options);
+  }
+}
+CrawlRecordSchema.loadClass(CrawlRecordClass);
+
+let CrawlRecordModel = mongoose.model('CrawlRecord', CrawlRecordSchema);
+
+export default CrawlRecordModel;
diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js
index 1180303..dcb1ac4 100644
--- a/src/sites/dandanzan.js
+++ b/src/sites/dandanzan.js
@@ -1,10 +1,9 @@
-import netUtil from "../utils/net.util";
 import cheerio from "cheerio";
 import stringUtil from '../utils/string.util';
 import Movie from '../models/Movies';
 import generalQueue from '../utils/general.queue';
-import proxy from './proxy';
 import proxyUtil from '../utils/proxy.util';
+import CrawlRecord from '../models/spider/CrawlRecord';
 
 const URL_BASE = 'https://www.dandanzan.com'
 const maxIdx = 100000;
@@ -14,7 +13,7 @@
  * @param {String} category - movie: film, tv: TV series, show: variety show
  * @param {Number} sortIdx
  * */
-const parseOnePage = async (subLink, category, sortIdx) => {
+const parseOnePage = async ({subLink, category, sortIdx}) => {
   const url = `${URL_BASE}${subLink}`
   try {
     let html = await proxyUtil.getDataProxy(url)
@@ -61,14 +60,26 @@
         record.sortIdx = sortIdx;
       }
       await record.save();
+      await CrawlRecord.updateRecord({url: url,
+        className: 'dandanzan',
+        methodName: 'parseOnePage',
+        params: {subLink, category, sortIdx},
+        lastStatus: true,
+      })
       console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
     }
   } catch (err) {
     console.log(err);
+    await CrawlRecord.updateRecord({url: url,
+      className: 'dandanzan',
+      methodName: 'parseOnePage',
+      params: {subLink, category, sortIdx},
+      lastStatus: false,
+    })
   }
 }
 
-const parseListPage = async (idx, category) => {
+const parseListPage = async ({idx, category}) => {
   let subName = 'dianying';
   let index = 0;
   switch (category) {
@@ -91,8 +102,20 @@
   let html;
   try {
     html = await proxyUtil.getDataProxy(url)
+    await CrawlRecord.updateRecord({url: url,
+      className: 'dandanzan',
+      methodName: 'parseListPage',
+      params: {idx, category},
+      lastStatus: true,
+    })
   } catch (err) {
     console.log(err);
+    await CrawlRecord.updateRecord({url: url,
+      className: 'dandanzan',
+      methodName: 'parseListPage',
+      params: {idx, category},
+      lastStatus: false,
+    })
   }
   if (html) {
     const $ = cheerio.load(html);
@@ -101,12 +124,12 @@
     $(hrefs).each(function(i, link){
       pages.push($(this).attr('href'));
     });
-    for(let page of pages) {
+    for(let subLink of pages) {
       try {
         let sortIdx = maxIdx - (idx * 24 + (index ++) );
         generalQueue.addQueue({
           run: async function () {
-            await parseOnePage(page, category, sortIdx);
+            await parseOnePage({subLink, category, sortIdx});
           }
         })
       } catch (err) {
@@ -136,11 +159,11 @@
 }
 const parseAllMovie = async (category) => {
   console.time('all');
-  let allPageNo = await parseListPage(0, category);
+  let allPageNo = await parseListPage({idx: 0, category: category});
   console.log('app page is', allPageNo);
   if (allPageNo > 1) {
     for (let i = 1; i <= allPageNo; i++) {
-      await parseListPage(i, category);
+      await parseListPage({idx: i, category: category});
     }
   }
 }
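
Note (not part of the diff above): a minimal sketch of how the new CrawlRecord.updateRecord upsert behaves, written as a standalone script. The connection string mongodb://localhost/spider and the import path are hypothetical; the project's real connection setup is not shown in this patch. Each call upserts on url, $inc-increments tryCount, and additionally increments errCount when lastStatus is false, so retries and failures accumulate on one document per URL:

    import mongoose from 'mongoose';
    import CrawlRecord from './src/models/spider/CrawlRecord';

    const demo = async () => {
      // hypothetical connection string; the real config is not part of this patch
      await mongoose.connect('mongodb://localhost/spider');

      // first attempt fails: the upsert creates the document with
      // tryCount = 1, errCount = 1, lastStatus = false
      await CrawlRecord.updateRecord({
        url: 'https://www.dandanzan.com/dianying/index.html',
        className: 'dandanzan',
        methodName: 'parseListPage',
        params: {idx: 0, category: 'movie'},
        lastStatus: false,
      });

      // a later attempt succeeds: same document, tryCount becomes 2,
      // errCount stays 1, lastStatus flips to true
      await CrawlRecord.updateRecord({
        url: 'https://www.dandanzan.com/dianying/index.html',
        className: 'dandanzan',
        methodName: 'parseListPage',
        params: {idx: 0, category: 'movie'},
        lastStatus: true,
      });

      await mongoose.disconnect();
    };

    demo().catch(console.error);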