diff --git a/src/models/spider/CrawlRecord.js b/src/models/spider/CrawlRecord.js
index 9307f22..9c029a2 100644
--- a/src/models/spider/CrawlRecord.js
+++ b/src/models/spider/CrawlRecord.js
@@ -1,5 +1,4 @@
 import mongoose from 'mongoose';
-import MovieModel from "../Movies";
 
 let Schema = mongoose.Schema;
 
diff --git a/src/models/spider/CrawlSession.js b/src/models/spider/CrawlSession.js
new file mode 100644
index 0000000..89a8d6c
--- /dev/null
+++ b/src/models/spider/CrawlSession.js
@@ -0,0 +1,38 @@
+import mongoose from 'mongoose';
+
+let Schema = mongoose.Schema;
+
+let CrawlSessionSchema = new Schema({
+    beginTime: {type: Date},
+    endTime: {type: Date},
+    name: {type: String},
+    step: {type: Number},
+    // YYYY-MM-DD
+    day: {type: String},
+    // 0: in progress, 1: finished
+    status: {type: Number}
+}, {
+    collection: 'crawl_session',
+    timestamps: true
+});
+
+class CrawlSessionClass {
+    // Upsert a crawl record keyed by URL: bump tryCount on every attempt,
+    // bump errCount when the last attempt failed, then stamp lastTry.
+    static async updateRecord(record) {
+        const query = {url: record.url};
+        const options = {upsert: true, setDefaultsOnInsert: true};
+        let incObj = {tryCount: 1};
+        if (!record.lastStatus) {
+            incObj.errCount = 1;
+        }
+        record['$inc'] = incObj;
+        record.lastTry = new Date();
+        await CrawlSessionModel.update(query, record, options);
+    }
+}
+CrawlSessionSchema.loadClass(CrawlSessionClass);
+
+let CrawlSessionModel = mongoose.model('CrawlSession', CrawlSessionSchema);
+
+export default CrawlSessionModel;
diff --git a/src/sites/dandanzan.js b/src/sites/dandanzan.js
index 339d7c7..1ed5102 100644
--- a/src/sites/dandanzan.js
+++ b/src/sites/dandanzan.js
@@ -154,12 +154,12 @@ const parseListPage = async ({idx, category}) => {
         return 1;
     }
 }
-const parseAllMovie = async (category) => {
+const parseAllMovie = async (category, beginNo = 1) => {
     console.time('all');
     let allPageNo = await parseListPage({idx: 0, category: category});
     console.log('app page is', allPageNo);
     if (allPageNo > 1) {
-        for (let i = 1; i <= allPageNo; i++) {
+        for (let i = beginNo; i <= allPageNo; i++) {
             await parseListPage({idx: i, category: category});
         }
     }
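
A minimal usage sketch, not part of the diff: given the fields the new CrawlSession schema defines (name, day, step, status) and the beginNo parameter added to parseAllMovie, an interrupted crawl could be resumed from the last recorded page. The resumeCrawl helper is hypothetical, `step` is assumed to hold the last completed page number, and parseAllMovie is assumed to be exported from dandanzan.js.

import CrawlSessionModel from './models/spider/CrawlSession';
import { parseAllMovie } from './sites/dandanzan'; // assumed export

// Hypothetical helper: find today's unfinished session (status 0)
// and restart the crawl from the page after the last recorded step.
async function resumeCrawl(category) {
    // day uses the YYYY-MM-DD format noted in the schema comment
    const day = new Date().toISOString().slice(0, 10);
    const session = await CrawlSessionModel.findOne({name: category, day: day, status: 0});
    const beginNo = session ? session.step + 1 : 1;
    await parseAllMovie(category, beginNo);
}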