Add crawl logging

zhl 2019-05-09 11:14:52 +08:00
parent bbaa44f8c1
commit 250f17a35a
3 changed files with 78 additions and 8 deletions

View File

@@ -14,3 +14,8 @@ sites/bookChapter.js
## 20190426 Dandanzan movie and TV data
sites/dandanzan.js
## 20190508 Add proxy data crawling
sites/proxy.js

View File: models/spider/CrawlRecord.js

@@ -0,0 +1,42 @@
import mongoose from 'mongoose';
let Schema = mongoose.Schema;
let CrawlRecordSchema = new Schema({
// URL that was crawled
url: {type: String},
// class that ran the crawl task
className: {type: String},
// method that ran the crawl task
methodName: {type: String},
// arguments passed to the crawl method
params: {type: Schema.Types.Mixed},
// time of the last attempt; Date.now is evaluated per document,
// whereas new Date() would be fixed once at schema-definition time
lastTry: {type: Date, default: Date.now},
// total attempts for this url
tryCount: {type: Number, default: 0},
// failed attempts for this url
errCount: {type: Number, default: 0},
// whether the last attempt succeeded
lastStatus: {type: Boolean, default: false}
}, {
collection: 'crawl_record',
timestamps: true
});
class CrawlRecordClass {
static async updateRecord(record) {
const query = {url: record.url};
const options = {upsert: true, setDefaultsOnInsert: true};
// every attempt bumps tryCount; failures also bump errCount
let incObj = {tryCount: 1};
if (!record.lastStatus) {
incObj.errCount = 1;
}
record['$inc'] = incObj;
record.lastTry = new Date();
// upsert keeps one document per url; plain fields become $set alongside $inc
await CrawlRecordModel.update(query, record, options);
}
}
CrawlRecordSchema.loadClass(CrawlRecordClass);
let CrawlRecordModel = mongoose.model('CrawlRecord', CrawlRecordSchema);
export default CrawlRecordModel;
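
A minimal usage sketch (not part of this commit; the URL and values are made up, and the import path depends on the caller's location) showing how a crawler logs one attempt. The upsert keeps a single crawl_record document per url, while $inc accumulates tryCount and errCount across attempts:

import CrawlRecord from '../models/spider/CrawlRecord';

// Hypothetical call site: log a successful fetch of one detail page.
const logOneAttempt = async () => {
  await CrawlRecord.updateRecord({
    url: 'https://www.dandanzan.com/dianying/1.html', // assumed URL shape
    className: 'dandanzan',
    methodName: 'parseOnePage',
    params: {subLink: '/dianying/1.html', category: 'movie', sortIdx: 99999},
    lastStatus: true // false would also increment errCount
  });
};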

View File: sites/dandanzan.js

@@ -1,10 +1,9 @@
import netUtil from "../utils/net.util";
import cheerio from "cheerio";
import stringUtil from '../utils/string.util';
import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';
import proxy from './proxy';
import proxyUtil from '../utils/proxy.util';
import CrawlRecord from '../models/spider/CrawlRecord';
const URL_BASE = 'https://www.dandanzan.com'
const maxIdx = 100000;
@@ -14,7 +13,7 @@ const maxIdx = 100000;
* @param {String} category - movie: film, tv: TV series, show: variety show
* @param {Number} sortIdx
* */
const parseOnePage = async (subLink, category, sortIdx) => {
const parseOnePage = async ({subLink, category, sortIdx}) => {
const url = `${URL_BASE}${subLink}`
try {
let html = await proxyUtil.getDataProxy(url)
@@ -61,14 +60,26 @@ const parseOnePage = async (subLink, category, sortIdx) => {
record.sortIdx = sortIdx;
}
await record.save();
await CrawlRecord.updateRecord({
url: url,
className: 'dandanzan',
methodName: 'parseOnePage',
params: {subLink, category, sortIdx},
// must be lastStatus (the schema field); statusCode would be dropped
lastStatus: true,
})
console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
}
} catch (err) {
console.log(err);
await CrawlRecord.updateRecord({
url: url,
className: 'dandanzan',
methodName: 'parseOnePage',
params: {subLink, category, sortIdx},
lastStatus: false,
})
}
}
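
Because each record stores className, methodName and params, failed crawls become replayable. A hypothetical retry helper (not in this commit; assumes it sits in this file next to parseOnePage) could query the log and re-dispatch:

// Hypothetical: re-run every detail page whose last attempt failed.
const retryFailedPages = async () => {
  const failed = await CrawlRecord.find({
    className: 'dandanzan',
    methodName: 'parseOnePage',
    lastStatus: false
  });
  for (const rec of failed) {
    // rec.params holds the original {subLink, category, sortIdx}
    await parseOnePage(rec.params);
  }
};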
const parseListPage = async (idx, category) => {
const parseListPage = async ({idx, category}) => {
let subName = 'dianying';
let index = 0;
switch (category) {
@@ -91,8 +102,20 @@ const parseListPage = async (idx, category) => {
let html;
try {
html = await proxyUtil.getDataProxy(url)
await CrawlRecord.updateRecord({
url: url,
className: 'dandanzan',
methodName: 'parseListPage',
params: {idx, category},
lastStatus: true,
})
} catch (err) {
console.log(err);
await CrawlRecord.updateRecord({
url: url,
className: 'dandanzan',
methodName: 'parseListPage',
params: {idx, category},
lastStatus: false,
})
}
if (html) {
const $ = cheerio.load(html);
@@ -101,12 +124,12 @@ const parseListPage = async (idx, category) => {
$(hrefs).each(function(i, link){
pages.push($(this).attr('href'));
});
for(let page of pages) {
for(let subLink of pages) {
try {
// earlier pages get a larger sortIdx so newer items sort first
let sortIdx = maxIdx - (idx * 24 + index++);
generalQueue.addQueue({
run: async function () {
await parseOnePage(page, category, sortIdx);
await parseOnePage({subLink, category, sortIdx});
}
})
} catch (err) {
@@ -136,11 +159,11 @@ const parseListPage = async (idx, category) => {
}
const parseAllMovie = async (category) => {
console.time('all');
let allPageNo = await parseListPage(0, category);
let allPageNo = await parseListPage({idx: 0, category: category});
console.log('all pages:', allPageNo);
if (allPageNo > 1) {
for (let i = 1; i <= allPageNo; i++) {
await parseListPage(i, category);
await parseListPage({idx: i, category: category});
}
}
}
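
For reference, a hypothetical entry point (the real invocation lives outside this diff):

// Hypothetical: kick off a full crawl; category is 'movie', 'tv' or 'show'.
parseAllMovie('movie').catch(err => console.log(err));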