#!/bin/bash
#
# Packaging script: installs the npm dependencies, runs gulp, removes ./src,
# archives the working directory and moves the archive to
# ./target/pikachu_be.tar.gz. Progress messages are appended to boundle.log.

source /etc/profile

# Stop when install or build fails: the following steps delete ./src and
# archive whatever is left, so carrying on would package a broken build.
npm install --prefer-offline --loglevel info --unsafe-perm=true --allow-root --build-from-source >> boundle.log || exit 1
echo 'copy node_modules to /data/publish/node_packages' >> boundle.log

gulp || exit 1

rm -rf ./src

tar -zcvf pikachu_be.tar.gz ./

# -p: a ./target directory left over from an earlier run must not make mkdir
# fail, because that would silently skip the move.
mkdir -p target && mv ./pikachu_be.tar.gz ./target/pikachu_be.tar.gz
# Redirect stdout to the log first; only then does 2>&1 send stderr there too.
echo 'all done' >> boundle.log 2>&1
"cross-env NODE_ENV=production nodemon src/app.js --exec babel-node " + }, + "repository": { + "type": "git", + "url": "http://git.kingsome.cn/yulixing/taptap.git" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "cheerio": "^1.0.0-rc.3", + "cross-env": "^6.0.0", + "express": "^4.17.1", + "moment": "^2.24.0", + "mongoose": "^5.7.1", + "node-schedule": "^1.3.2", + "request": "^2.88.0", + "superagent": "^5.1.0", + "superagent-proxy": "^2.0.0" + }, + "devDependencies": { + "babel-cli": "^6.26.0", + "babel-preset-node8": "1.2.0", + "cross-env": "^5.2.0", + "gulp": "3.9.1", + "gulp-babel": "6.1.2", + "gulp-sourcemaps": "^2.6.5" + } +} diff --git a/src/app.js b/src/app.js new file mode 100644 index 0000000..e719218 --- /dev/null +++ b/src/app.js @@ -0,0 +1,32 @@ +import mongoose from 'mongoose' +import schedule from 'node-schedule' + +import config from '../config/config' +import getCate from './spider/cate' + +mongoose.connect(config.db_taptap, { + useNewUrlParser: true, + useUnifiedTopology: true, +}) + +const scheduleCronstyle = () => { + console.log('TapTap爬虫正在运行!') + const timer = schedule.scheduleJob('0 0 10 * * *', () => { + console.log(new Date() + '开始收集数据!') + getAllData() + }) +} + +scheduleCronstyle() + +function getAllData() { + setTimeout(getCate, 0, 'download') + setTimeout(getCate, 1200000, 'new') + setTimeout(getCate, 2400000, 'reserve') + setTimeout(getCate, 3600000, 'sell') + setTimeout(getCate, 4800000, 'played') +} + + + + diff --git a/src/model/Game.js b/src/model/Game.js new file mode 100644 index 0000000..b10ca8d --- /dev/null +++ b/src/model/Game.js @@ -0,0 +1,33 @@ +'use strict' +import mongoose from 'mongoose' + +/** + * 游戏信息 + */ +const Game = new mongoose.Schema( + { + cateName: {type: String}, + icon: {type: String}, + order: {type: Number}, + title: {type: String}, + author: {type: String}, + score: {type: Number}, + desc: {type: String}, + tags: {type: Array}, + cate: {type: String}, + gameid: {type: Number}, 
// Crawl state shared by getCate, getData and analyze.
let next = '' // URL of the list page still to be fetched; '' once done
let timer = '' // handle of the polling interval started by getCate
let data = [] // rows collected from the list pages of the current category
let fetching = false // true while a list request is awaiting its response

/**
 * Start crawling the list of one category.
 *
 * Looks up the category's start URL in config.taptap and polls getData
 * every 3000 ms until the list is exhausted.
 *
 * @param {string} cateName - key into config.taptap, also stored on each row
 */
function getCate(cateName) {
  next = config.taptap[cateName]
  timer = setInterval(getData, 3000, cateName)
}

/**
 * One polling tick.
 *
 * While `next` holds a taptap.com URL the page is fetched and parsed;
 * otherwise the interval is stopped and the collected rows are handed to
 * getDetails. Errors are logged and the same step is retried on the next
 * tick.
 *
 * @param {string} cateName - category being crawled
 */
async function getData(cateName) {
  // A request slower than the polling period must not be issued twice:
  // the duplicate response would be parsed after `data` was handed over and
  // reset, leaving stale rows behind for the next category.
  if (fetching) {
    return
  }
  try {
    if (next && next.startsWith('https://www.taptap.com')) {
      fetching = true
      console.log(next)
      const res = await request.get(next)
      analyze(res.body.data.html, cateName)
      // NOTE(review): the link in res.body.data.next is ignored, so only the
      // first page of each category is crawled. Looks like a debugging
      // leftover - confirm before enabling pagination.
      next = ''
    } else {
      clearInterval(timer)
      getDetails(cateName, data)
      data = []
    }
  } catch (err) {
    console.log(err)
  } finally {
    fetching = false
  }
}

/**
 * Parse one list page and append a row per `.taptap-top-card` element to
 * the module-level `data` array. Errors are logged, not thrown.
 *
 * @param {string} str - HTML fragment taken from the list response
 * @param {string} cateName - category stored on every row
 */
function analyze(str, cateName) {
  try {
    console.log('进入分析')
    str = parseHtml(str)

    const $ = cheerio.load(str)
    const cards = $('.taptap-top-card')
    // Taken per page rather than once at import time, so a process that
    // stays up for days stamps rows with the day they were collected.
    const collectedOn = moment().format('YYYY-MM-DD')
    const regId = /https:\/\/www\.taptap\.com\/app\/(\d+)/

    cards.each(function(idx, element) {
      const $element = $(element)
      const info = {}

      // Category
      info.cateName = cateName

      // Game icon
      info.icon = $element
        .find('.top-card-left .card-left-image img')
        .attr('src')

      // Rank. NOTE(review): the fixed offset of 30 is not explained in this
      // file - confirm it still matches the page's numbering.
      info.order =
        parseInt($element.find('.top-card-order-text').text(), 10) - 30

      // Title
      info.title = $element
        .find('.top-card-middle .card-middle-title h4')
        .text()
      info.title = info.title.replace(/ */g, '')

      // Publisher
      info.author = $element
        .find('.top-card-middle .card-middle-author a')
        .text()

      // Rating
      info.score = parseFloat(
        $element
          .find(
            '.top-card-middle .card-middle-score .middle-footer-rating span'
          )
          .text()
      )

      // Description
      info.desc = $element
        .find('.top-card-middle .card-middle-description')
        .text()
      info.desc = info.desc.replace(/ */g, '')

      // Tags
      info.tags = []
      $element.find('.top-card-middle .card-tags a').each(function(i, tag) {
        info.tags.push($(tag).text())
      })

      // Genre
      info.cate = $element
        .find('.top-card-middle .card-middle-category a')
        .text()

      // Game id, taken from the detail-page link. At least one digit is
      // required so a link without an id yields 0 instead of NaN.
      const url = $element
        .find('.top-card-middle .card-middle-title')
        .attr('href')
      const analyzeId = regId.exec(url)
      info.gameid = analyzeId ? parseInt(analyzeId[1], 10) : 0

      // Collection date
      info.date = collectedOn
      data.push(info)
    })

    console.log('循环结束')
  } catch (err) {
    console.log(err)
  }
}

export default getCate
/**
 * Extract the four audience counters from a statistics string.
 *
 * For each label the digits immediately before " <label>" are parsed as a
 * base-10 integer. A label that is missing, or that has no digits directly
 * before it, yields 0 (never NaN).
 *
 * @param {string} str - text containing fragments such as "123 人关注"
 * @returns {{watch: number, reserve: number, sell: number, download: number}}
 */
export default function formWatch(str) {
  // Non-global literals: exec keeps no lastIndex state between calls.
  // `\d+` (not `\d*`) so that a bare label cannot match with an empty
  // capture, which parseInt would turn into NaN.
  const counters = [
    ['watch', /(\d+) 人关注/],
    ['reserve', /(\d+) 人预约/],
    ['sell', /(\d+) 人购买/],
    ['download', /(\d+) 人安装/],
  ]
  const res = {}

  counters.forEach(([field, pattern]) => {
    const found = pattern.exec(str)
    res[field] = found ? parseInt(found[1], 10) : 0
  })

  return res
}