taptap 爬虫
This commit is contained in:
parent
1d7c902375
commit
6f99261670
22
.gitignore
vendored
Normal file
22
.gitignore
vendored
Normal file
@ -0,0 +1,22 @@
|
||||
/.env
|
||||
/.idea/
|
||||
|
||||
**/node_modules
|
||||
**/.DS_Store
|
||||
|
||||
/config/config.js
|
||||
/config/game_dic.js
|
||||
|
||||
/public
|
||||
/logs
|
||||
/build
|
||||
/dist
|
||||
/lib
|
||||
rev-manifest.json
|
||||
/yarn.lock
|
||||
/nohup.out
|
||||
/package-lock.json
|
||||
/dist.tar.gz
|
||||
*.swp
|
||||
/logsgarfield
|
||||
.vscode/launch.json
|
15
boundle.sh
Normal file
15
boundle.sh
Normal file
@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Build-and-package script: installs dependencies, transpiles the sources with
# gulp, removes ./src and bundles the working directory into
# target/pikachu_be.tar.gz. Progress messages are appended to boundle.log.

source /etc/profile

# Abort on the first failing step. Without this a failed `gulp` would still be
# followed by `rm -rf ./src`, destroying the sources and packaging a broken
# build. Enabled only after sourcing the system profile so that non-zero
# statuses inside profile snippets cannot abort the build.
set -e

npm install --prefer-offline --loglevel info --unsafe-perm=true --allow-root --build-from-source >> boundle.log
echo 'copy node_modules to /data/publish/node_packages' >> boundle.log

gulp

rm -rf ./src

# -p keeps re-runs working when ./target is left over from a previous build.
mkdir -p target

# Write the archive straight into ./target and exclude that directory, so the
# tree being archived is not modified while tar is reading it and an old
# archive is never packed into the new one.
tar --exclude=./target -zcvf target/pikachu_be.tar.gz ./

# Redirect stdout to the log first, then point stderr at the same place.
echo 'all done' >> boundle.log 2>&1
|
24
gulpfile.babel.js
Normal file
24
gulpfile.babel.js
Normal file
@ -0,0 +1,24 @@
|
||||
'use strict'
|
||||
import gulp from 'gulp'
|
||||
import babel from 'gulp-babel'
|
||||
import sourcemaps from 'gulp-sourcemaps'
|
||||
|
||||
/**
 * Build step: reads everything under src/, runs it through Babel with the
 * `node8` preset, and writes the output plus external source maps to lib/.
 *
 * @returns {object} the gulp stream, so gulp can tell when the task finishes
 */
const compileCode = () => {
  const transpile = babel({presets: ['node8']})

  const sources = gulp.src(['src/**/*'])
  const compiled = sources.pipe(sourcemaps.init()).pipe(transpile)

  // '.' writes the .map files next to the compiled files.
  return compiled.pipe(sourcemaps.write('.')).pipe(gulp.dest('lib/'))
}

// `gulp compile` runs the build; plain `gulp` depends on it.
gulp.task('compile', compileCode)
gulp.task('default', ['compile'])
|
37
package.json
Normal file
37
package.json
Normal file
@ -0,0 +1,37 @@
|
||||
{
|
||||
"name": "taptap",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"dev": "cross-env nodemon src/app.js --exec babel-node ",
|
||||
"pro": "cross-env NODE_ENV=production nodemon src/app.js --exec babel-node "
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "http://git.kingsome.cn/yulixing/taptap.git"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"cross-env": "^6.0.0",
|
||||
"express": "^4.17.1",
|
||||
"moment": "^2.24.0",
|
||||
"mongoose": "^5.7.1",
|
||||
"node-schedule": "^1.3.2",
|
||||
"request": "^2.88.0",
|
||||
"superagent": "^5.1.0",
|
||||
"superagent-proxy": "^2.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"babel-cli": "^6.26.0",
|
||||
"babel-preset-node8": "1.2.0",
|
||||
"cross-env": "^5.2.0",
|
||||
"gulp": "3.9.1",
|
||||
"gulp-babel": "6.1.2",
|
||||
"gulp-sourcemaps": "^2.6.5"
|
||||
}
|
||||
}
|
32
src/app.js
Normal file
32
src/app.js
Normal file
@ -0,0 +1,32 @@
|
||||
import mongoose from 'mongoose'
|
||||
import schedule from 'node-schedule'
|
||||
|
||||
import config from '../config/config'
|
||||
import getCate from './spider/cate'
|
||||
|
||||
// Open the MongoDB connection used by the models. The connection string comes
// from the local config file (config.db_taptap).
mongoose
  .connect(config.db_taptap, {
    useNewUrlParser: true,
    useUnifiedTopology: true,
  })
  .catch(err => {
    // connect() returns a promise; without a handler a failed initial
    // connection would surface only as an unhandled rejection.
    console.log(err)
  })
|
||||
|
||||
/**
 * Register the recurring crawl with node-schedule.
 *
 * The six-field cron expression '0 0 10 * * *' fires at second 0, minute 0,
 * hour 10 of every day; each firing logs a timestamped message and calls
 * getAllData().
 */
const scheduleCronstyle = () => {
  console.log('TapTap爬虫正在运行!')

  const onTick = () => {
    console.log(new Date() + '开始收集数据!')
    getAllData()
  }

  schedule.scheduleJob('0 0 10 * * *', onTick)
}
|
||||
|
||||
scheduleCronstyle()
|
||||
|
||||
/**
 * Queue one getCate() run per ranking category.
 *
 * The runs are spaced 1,200,000 ms (20 minutes) apart, in the order
 * download, new, reserve, sell, played; the first one is queued with a
 * delay of 0.
 */
function getAllData() {
  const staggerMs = 1200000
  const cateNames = ['download', 'new', 'reserve', 'sell', 'played']

  cateNames.forEach((cateName, position) => {
    setTimeout(getCate, position * staggerMs, cateName)
  })
}
|
||||
|
||||
|
||||
|
||||
|
33
src/model/Game.js
Normal file
33
src/model/Game.js
Normal file
@ -0,0 +1,33 @@
|
||||
'use strict'
|
||||
import mongoose from 'mongoose'
|
||||
|
||||
/**
 * Game information: one document per game per collection run.
 */

// Field definitions. The first group is filled from the ranking list page,
// the second group from the game's detail page.
const gameFields = {
  cateName: {type: String},
  icon: {type: String},
  order: {type: Number},
  title: {type: String},
  author: {type: String},
  score: {type: Number},
  desc: {type: String},
  tags: {type: Array},
  cate: {type: String},
  gameid: {type: Number},
  date: {type: String},

  review: {type: Number},
  topic: {type: Number},
  watch: {type: Number},
  reserve: {type: Number},
  sell: {type: Number},
  download: {type: Number},
}

// Stored in the `games` collection; mongoose maintains createdAt/updatedAt.
const gameOptions = {
  collection: 'games',
  timestamps: true,
}

const Game = new mongoose.Schema(gameFields, gameOptions)

export default mongoose.model('Game', Game)
|
125
src/spider/cate.js
Normal file
125
src/spider/cate.js
Normal file
@ -0,0 +1,125 @@
|
||||
import request from 'superagent'
|
||||
import cheerio from 'cheerio'
|
||||
import moment from 'moment'
|
||||
require('superagent-proxy')(request)
|
||||
|
||||
import config from '../../config/config'
|
||||
import {clearInterval} from 'timers'
|
||||
|
||||
import parseHtml from '../utils/parseHtml'
|
||||
|
||||
import getDetails from './details'
|
||||
|
||||
// Collection date (YYYY-MM-DD). NOTE: evaluated once, when this module is
// first loaded, not once per crawl.
const date = moment().format('YYYY-MM-DD')

// Crawl state shared by getCate / getData / analyze in this module.
let next = '' // URL of the next list page to request; '' when there is none
let timer = '' // handle of the polling interval started by getCate
let data = [] // records collected so far for the category being crawled

/**
 * Start crawling the ranking list of one category.
 *
 * Looks up the first page URL in `config.taptap[cateName]` and polls
 * getData() every 3 seconds until the list is exhausted.
 *
 * @param {string} cateName key into `config.taptap`, also stored on each record
 */
function getCate(cateName) {
  // A previous crawl that is still polling would otherwise keep running
  // against the same shared state once `timer` is overwritten below.
  if (timer) {
    clearInterval(timer)
  }
  // Do not let records of an abandoned crawl leak into this category.
  data = []

  next = config.taptap[cateName]
  timer = setInterval(getData, 3000, cateName)
}
|
||||
|
||||
// True while a list request is in flight. The interval keeps ticking every
// 3 seconds regardless of how long a request takes, and `next` is only
// cleared after the response arrives, so without this guard a slow response
// makes the following tick fetch (and record) the same page again.
let fetching = false

/**
 * One polling tick of the list crawl.
 *
 * While `next` holds a taptap.com URL, fetch it and hand the returned HTML to
 * analyze(). Otherwise the list is finished: stop the interval, pass the
 * collected records to getDetails() and reset the buffer.
 *
 * @param {string} cateName category being crawled
 */
async function getData(cateName) {
  if (fetching) {
    return
  }

  try {
    if (next && next.startsWith('https://www.taptap.com')) {
      console.log(next)
      fetching = true
      const res = await request.get(next)
      analyze(res.body.data.html, cateName)
      // Pagination is switched off: only the first page is collected.
      // Following pages would use `next = res.body.data.next`.
      next = ''
    } else {
      clearInterval(timer)
      getDetails(cateName, data)
      data = []
    }
  } catch (err) {
    // `next` is left untouched on failure, so the same URL is retried on the
    // next tick.
    console.log(err)
  } finally {
    fetching = false
  }
}
|
||||
|
||||
/**
 * Parse one page of ranking-list HTML and append one record per game card
 * (`.taptap-top-card`) to the module-level `data` buffer.
 *
 * Errors are logged and swallowed so a malformed page does not stop the
 * polling interval.
 *
 * @param {string} str HTML fragment returned by the list request
 * @param {string} cateName category name stored on every record
 */
function analyze(str, cateName) {
  try {
    console.log('进入分析')

    const $ = cheerio.load(parseHtml(str))

    // Stamp records with the day they are actually collected. The process is
    // long-running (daily cron), so a date computed once at module load
    // would be stale from the second day on.
    const today = moment().format('YYYY-MM-DD')

    // Dots escaped so only the real host matches; `\d+` so a URL without a
    // numeric id yields "no match" (gameid 0) instead of parseInt('') = NaN.
    const regId = /https:\/\/www\.taptap\.com\/app\/(\d+)/

    $('.taptap-top-card').each(function(idx, element) {
      const $card = $(element)
      const info = {}

      // Category
      info.cateName = cateName

      // Game icon
      info.icon = $card.find('.top-card-left .card-left-image img').attr('src')

      // Rank.
      // NOTE(review): the fixed offset of 30 is not explained anywhere in
      // this file - confirm it matches the page the crawl starts from.
      info.order = parseInt($card.find('.top-card-order-text').text(), 10) - 30

      // Title (all spaces are stripped, including spaces inside the title)
      info.title = $card
        .find('.top-card-middle .card-middle-title h4')
        .text()
        .replace(/ */g, '')

      // Publisher
      info.author = $card.find('.top-card-middle .card-middle-author a').text()

      // Rating. Falls back to 0 when the card shows no numeric rating, the
      // same convention the detail-page counters use; NaN is not a storable
      // Number.
      info.score =
        parseFloat(
          $card
            .find('.top-card-middle .card-middle-score .middle-footer-rating span')
            .text()
        ) || 0

      // Description (spaces stripped like the title)
      info.desc = $card
        .find('.top-card-middle .card-middle-description')
        .text()
        .replace(/ */g, '')

      // Tags
      info.tags = []
      $card.find('.top-card-middle .card-tags a').each(function(tagIdx, tagElement) {
        info.tags.push($(tagElement).text())
      })

      // Genre
      info.cate = $card.find('.top-card-middle .card-middle-category a').text()

      // Numeric game id, taken from the link to the game's page
      const url = $card.find('.top-card-middle .card-middle-title').attr('href')
      const analyzeId = regId.exec(url)
      info.gameid = analyzeId ? parseInt(analyzeId[1], 10) : 0

      // Collection date
      info.date = today

      data.push(info)
    })

    console.log('循环结束')
  } catch (err) {
    console.log(err)
  }
}
|
||||
|
||||
export default getCate
|
55
src/spider/details.js
Normal file
55
src/spider/details.js
Normal file
@ -0,0 +1,55 @@
|
||||
import request from 'superagent'
|
||||
import cheerio from 'cheerio'
|
||||
require('superagent-proxy')(request)
|
||||
|
||||
import formWatch from '../utils/formWatch'
|
||||
import saveData from '../utils/saveData'
|
||||
|
||||
import {clearInterval} from 'timers'
|
||||
|
||||
// Crawl state shared by getDetails / getData / analyze in this module.
let timer = '' // handle of the polling interval started by getDetails
let curIdx = 0 // index into `data` of the next game to fetch
let allData = [] // records completed with detail-page fields, pending save

/**
 * Start fetching the detail page of every game in `data`, one every
 * 3 seconds. When all entries have been visited the enriched records are
 * passed to saveData().
 *
 * @param {string} cateName category the records belong to
 * @param {Array<object>} data records produced by the list crawl; each needs
 *   `gameid`, `order` and `title`
 */
function getDetails(cateName, data) {
  // A previous run that is still polling shares curIdx/allData with this
  // one; stop it before resetting that state.
  if (timer) {
    clearInterval(timer)
  }

  allData = []
  curIdx = 0
  timer = setInterval(getData, 3000, {cateName, data})
}
|
||||
|
||||
/**
 * One polling tick of the detail crawl.
 *
 * Fetches the detail page of the next unvisited game and merges its counters
 * into the record via analyze(). Once every entry has been visited, stops the
 * interval and saves the collected records.
 *
 * @param {{cateName: string, data: Array<object>}} job category name and the
 *   records to enrich
 */
async function getData({cateName, data}) {
  try {
    if (curIdx >= data.length) {
      clearInterval(timer)
      saveData(allData)
      allData = []
      return
    }

    // Advance before awaiting. Previously the index only moved on success,
    // so a page that always fails (e.g. the gameid 0 fallback, or a removed
    // game) was retried every 3 seconds forever and the batch was never
    // saved. A failed entry is now logged and skipped.
    const info = data[curIdx]
    curIdx += 1

    console.log(info.order + ':' + info.title)
    const url = `https://www.taptap.com/app/${info.gameid}`
    // The 2 s timeout is shorter than the 3 s tick interval.
    const res = await request.get(url).timeout(2000)
    analyze(res.text, cateName, info)
  } catch (err) {
    console.log(err)
  }
}
|
||||
|
||||
/**
 * Read the counters from a game's detail page, write them onto `info`
 * (mutating it) and append `info` to the module-level `allData` buffer.
 *
 * Errors are logged and swallowed; in that case the record is not appended.
 *
 * @param {string} str HTML of the detail page
 * @param {string} cateName category name (not used here)
 * @param {object} info list-page record to enrich
 */
function analyze(str, cateName, info) {
  try {
    const $ = cheerio.load(str)

    // Follower / reservation / purchase / install counts share one element.
    const stats = formWatch($('.count-stats').text())
    info.watch = stats.watch
    info.reserve = stats.reserve
    info.sell = stats.sell
    info.download = stats.download

    // Tab badges; a missing or non-numeric badge counts as 0.
    const reviewText = $('a[data-taptap-tab="review"] small').text()
    info.review = parseInt(reviewText) || 0
    const topicText = $('a[data-taptap-tab="topic"] small').text()
    info.topic = parseInt(topicText) || 0

    allData.push(info)
  } catch (err) {
    console.log(err)
  }
}
|
||||
|
||||
export default getDetails
|
18
src/utils/formWatch.js
Normal file
18
src/utils/formWatch.js
Normal file
@ -0,0 +1,18 @@
|
||||
/**
 * Extract the follower / reservation / purchase / install counters from the
 * text of a game page's stats element.
 *
 * Each counter is matched as "<digits> <label>". A counter whose label is
 * absent, or whose label is not preceded by digits, is reported as 0.
 *
 * @param {string} str text to scan
 * @returns {{watch: number, reserve: number, sell: number, download: number}}
 */
export default function formWatch(str) {
  // `\d+` (not `\d*`): a label without a number must count as "no match".
  // With `\d*` the capture could be empty and parseInt('') produced NaN.
  const patterns = {
    watch: /(\d+) 人关注/,
    reserve: /(\d+) 人预约/,
    sell: /(\d+) 人购买/,
    download: /(\d+) 人安装/,
  }

  const res = {}
  for (const key of Object.keys(patterns)) {
    const match = patterns[key].exec(str)
    res[key] = match ? parseInt(match[1], 10) : 0
  }

  return res
}
|
8
src/utils/parseHtml.js
Normal file
8
src/utils/parseHtml.js
Normal file
@ -0,0 +1,8 @@
|
||||
/**
 * Clean an HTML fragment before parsing: removes every newline and every
 * backslash.
 *
 * @param {string} str raw HTML fragment
 * @returns {string} the cleaned fragment; '' when `str` is null or undefined
 */
export default function parseHtml(str) {
  // A response without an html field would otherwise throw on `.replace`.
  if (str == null) {
    return ''
  }

  return String(str).replace(/[\n\\]/g, '')
}
|
10
src/utils/saveData.js
Normal file
10
src/utils/saveData.js
Normal file
@ -0,0 +1,10 @@
|
||||
import Game from '../model/Game'
|
||||
|
||||
/**
 * Insert a batch of game records into the Game collection.
 *
 * Failures are logged, not rethrown, so the returned promise always
 * resolves; callers may ignore it or await it.
 *
 * @param {Array<object>} data records to insert
 * @returns {Promise<void>} settles once the insert has finished or failed
 */
export default async function saveData(data) {
  try {
    // insertMany is asynchronous: it has to be awaited for the catch below
    // to see a failure and for the success message to be truthful.
    await Game.insertMany(data)
    console.log(`${data.length}条数据已保存`)
  } catch (err) {
    console.log(err)
  }
}
|
Loading…
x
Reference in New Issue
Block a user