taptap 爬虫

This commit is contained in:
yulixing 2019-09-30 14:59:42 +08:00
parent 1d7c902375
commit 6f99261670
12 changed files with 382 additions and 0 deletions

3
.babelrc Normal file
View File

@ -0,0 +1,3 @@
{
"presets": ["node8"]
}

22
.gitignore vendored Normal file
View File

@ -0,0 +1,22 @@
/.env
/.idea/
**/node_modules
**/.DS_Store
/config/config.js
/config/game_dic.js
/public
/logs
/build
/dist
/lib
rev-manifest.json
/yarn.lock
/nohup.out
/package-lock.json
/dist.tar.gz
*.swp
/logsgarfield
.vscode/launch.json

15
boundle.sh Normal file
View File

@ -0,0 +1,15 @@
#!/bin/bash
# Build-and-package script: installs dependencies, compiles the sources with
# gulp, and bundles the working directory into target/pikachu_be.tar.gz.
# Progress messages are appended to boundle.log.
source /etc/profile
npm install --prefer-offline --loglevel info --unsafe-perm=true --allow-root --build-from-source >> boundle.log
echo 'copy node_modules to /data/publish/node_packages' >> boundle.log
# Stop before deleting the sources when compilation fails; otherwise a broken
# build would wipe ./src and still produce an archive.
if ! gulp; then
  echo 'gulp build failed' >> boundle.log
  exit 1
fi
rm -rf ./src
tar -zcvf pikachu_be.tar.gz ./
# -p keeps a rerun from failing (and skipping the mv) when target/ already exists.
mkdir -p target && mv ./pikachu_be.tar.gz ./target/pikachu_be.tar.gz
# The file redirection has to come before 2>&1 for stderr to reach the log too.
echo 'all done' >> boundle.log 2>&1

24
gulpfile.babel.js Normal file
View File

@ -0,0 +1,24 @@
'use strict'
import gulp from 'gulp'
import babel from 'gulp-babel'
import sourcemaps from 'gulp-sourcemaps'
// Compiles every file under src/ with Babel (node8 preset), attaches source
// maps through gulp-sourcemaps, and writes the result into lib/.
// Returns the gulp stream so the task runner can tell when the work is done.
function compileCode() {
  const babelOptions = {
    presets: ['node8'],
  }
  const sources = gulp.src(['src/**/*'])
  const compiled = sources.pipe(sourcemaps.init()).pipe(babel(babelOptions))
  return compiled.pipe(sourcemaps.write('.')).pipe(gulp.dest('lib/'))
}

// `compile` runs the build; `default` simply depends on `compile`.
gulp.task('compile', compileCode)
gulp.task('default', ['compile'])

37
package.json Normal file
View File

@ -0,0 +1,37 @@
{
"name": "taptap",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"dev": "cross-env nodemon src/app.js --exec babel-node ",
"pro": "cross-env NODE_ENV=production nodemon src/app.js --exec babel-node "
},
"repository": {
"type": "git",
"url": "http://git.kingsome.cn/yulixing/taptap.git"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"cross-env": "^6.0.0",
"express": "^4.17.1",
"moment": "^2.24.0",
"mongoose": "^5.7.1",
"node-schedule": "^1.3.2",
"request": "^2.88.0",
"superagent": "^5.1.0",
"superagent-proxy": "^2.0.0"
},
"devDependencies": {
"babel-cli": "^6.26.0",
"babel-preset-node8": "1.2.0",
"cross-env": "^5.2.0",
"gulp": "3.9.1",
"gulp-babel": "6.1.2",
"gulp-sourcemaps": "^2.6.5"
}
}

32
src/app.js Normal file
View File

@ -0,0 +1,32 @@
import mongoose from 'mongoose'
import schedule from 'node-schedule'
import config from '../config/config'
import getCate from './spider/cate'
// Open the MongoDB connection used by the models. The promise returned by
// connect() gets an explicit rejection handler so a failed initial connection
// is logged instead of surfacing as an unhandled rejection.
mongoose
  .connect(config.db_taptap, {
    useNewUrlParser: true,
    useUnifiedTopology: true,
  })
  .catch(err => {
    console.log(err)
  })
// Announces that the spider is running and registers the recurring crawl job
// with node-schedule using the cron-style rule '0 0 10 * * *'.
// Each time the job fires it logs a timestamped message and starts a full
// data collection via getAllData().
function scheduleCronstyle() {
  const collectOnSchedule = () => {
    console.log(new Date() + '开始收集数据!')
    getAllData()
  }
  console.log('TapTap爬虫正在运行')
  schedule.scheduleJob('0 0 10 * * *', collectOnSchedule)
}

// Register the job as soon as the module is loaded.
scheduleCronstyle()
// Starts one crawl per ranking category. The categories are staggered by
// 1,200,000 ms (20 minutes) each: 'download' fires immediately and 'played'
// fires last, 4,800,000 ms after the call.
function getAllData() {
  const staggerMs = 1200000
  const categories = ['download', 'new', 'reserve', 'sell', 'played']
  categories.forEach((cateName, position) => {
    setTimeout(getCate, position * staggerMs, cateName)
  })
}

33
src/model/Game.js Normal file
View File

@ -0,0 +1,33 @@
'use strict'
import mongoose from 'mongoose'
/**
 * Game record: one document per game and ranking category, stored in the
 * `games` collection with Mongoose-managed timestamps.
 */
const gameFields = {
  // Filled from the ranking list page.
  cateName: String, // ranking category the record was collected from
  icon: String, // icon image URL
  order: Number, // position in the ranking
  title: String,
  author: String, // publisher / vendor
  score: Number, // rating
  desc: String, // description text
  tags: Array,
  cate: String, // game genre
  gameid: Number, // numeric id taken from the game's page URL
  date: String, // collection date, formatted YYYY-MM-DD
  // Filled from the game's detail page.
  review: Number,
  topic: Number,
  watch: Number, // followers
  reserve: Number, // reservations
  sell: Number, // purchases
  download: Number, // installs
}

const gameSchema = new mongoose.Schema(gameFields, {
  collection: 'games',
  timestamps: true,
})

export default mongoose.model('Game', gameSchema)

125
src/spider/cate.js Normal file
View File

@ -0,0 +1,125 @@
import request from 'superagent'
import cheerio from 'cheerio'
import moment from 'moment'
require('superagent-proxy')(request)
import config from '../../config/config'
import {clearInterval} from 'timers'
import parseHtml from '../utils/parseHtml'
import getDetails from './details'
// Collection date stamp (YYYY-MM-DD), evaluated once when this module is loaded.
const date = moment().format('YYYY-MM-DD')
// URL of the next ranking page to fetch; an empty string means none is pending.
let next = ''
// Handle of the polling interval started by getCate ('' until the first run).
let timer = ''
// Records parsed from ranking pages so far; handed to getDetails, then reset.
let data = []
/**
 * Starts crawling one ranking category.
 *
 * Looks up the category's start URL in `config.taptap`, stores it as the next
 * page to fetch, and begins polling `getData` every 3000 ms.
 *
 * @param {string} cateName - key into `config.taptap`; also passed through to
 *   `getData` on every tick.
 */
function getCate(cateName) {
  const startUrl = config.taptap[cateName]
  const pollEveryMs = 3000
  next = startUrl
  timer = setInterval(getData, pollEveryMs, cateName)
}
/**
 * One polling tick of the ranking crawl.
 *
 * While `next` holds a https://www.taptap.com URL, the page is fetched, its
 * HTML is passed to `analyze`, and `next` is cleared (pagination is switched
 * off, so only the first page is read). On the following tick the interval is
 * stopped, the collected records go to `getDetails`, and the buffer is reset.
 * Any error is logged and the tick ends without changing `next`.
 *
 * @param {string} cateName - ranking category being crawled.
 */
async function getData(cateName) {
  try {
    const hasPendingPage =
      Boolean(next) && next.startsWith('https://www.taptap.com')
    if (!hasPendingPage) {
      // Nothing left to fetch: stop polling and move on to the detail pages.
      clearInterval(timer)
      getDetails(cateName, data)
      data = []
      return
    }
    console.log(next)
    const response = await request.get(next)
    analyze(response.body.data.html, cateName)
    // The response's own `next` link is deliberately ignored for now.
    next = ''
  } catch (err) {
    console.log(err)
  }
}
/**
 * Parses one ranking-page HTML fragment and appends one record per game card
 * to the module-level `data` array.
 *
 * Errors are logged and swallowed, so a malformed page does not stop the crawl.
 *
 * @param {string} str - HTML fragment from the ranking response; it is cleaned
 *   with parseHtml before being loaded into cheerio.
 * @param {string} cateName - ranking category, copied onto every record.
 */
function analyze(str, cateName) {
  // Dots are escaped and at least one digit is required, so a link without a
  // numeric id falls back to 0 instead of producing NaN.
  const appIdPattern = /https:\/\/www\.taptap\.com\/app\/(\d+)/
  try {
    console.log('进入分析')
    // Stamp records with the day this page is parsed. A date computed once at
    // module load would go stale in a process that runs the crawl every day.
    const collectedOn = moment().format('YYYY-MM-DD')
    const $ = cheerio.load(parseHtml(str))
    $('.taptap-top-card').each(function(idx, element) {
      const $card = $(element)
      const info = {}
      // Category
      info.cateName = cateName
      // Game icon
      info.icon = $card.find('.top-card-left .card-left-image img').attr('src')
      // Rank
      // NOTE(review): the fixed -30 offset is carried over unchanged; confirm
      // it is still wanted now that only the first page is fetched.
      info.order = parseInt($card.find('.top-card-order-text').text(), 10) - 30
      // Title, with all spaces removed
      info.title = $card
        .find('.top-card-middle .card-middle-title h4')
        .text()
        .replace(/ /g, '')
      // Vendor
      info.author = $card.find('.top-card-middle .card-middle-author a').text()
      // Rating
      // NOTE(review): parseFloat yields NaN when no rating text is present;
      // verify how the Game model handles that on save.
      info.score = parseFloat(
        $card
          .find('.top-card-middle .card-middle-score .middle-footer-rating span')
          .text()
      )
      // Description, with all spaces removed
      info.desc = $card
        .find('.top-card-middle .card-middle-description')
        .text()
        .replace(/ /g, '')
      // Tags
      info.tags = []
      $card.find('.top-card-middle .card-tags a').each(function(tagIdx, tag) {
        info.tags.push($(tag).text())
      })
      // Genre
      info.cate = $card.find('.top-card-middle .card-middle-category a').text()
      // Game id, taken from the link to the game's page
      const href = $card.find('.top-card-middle .card-middle-title').attr('href')
      const idMatch = appIdPattern.exec(href)
      info.gameid = idMatch ? parseInt(idMatch[1], 10) : 0
      // Collection date
      info.date = collectedOn
      data.push(info)
    })
    console.log('循环结束')
  } catch (err) {
    console.log(err)
  }
}
export default getCate

55
src/spider/details.js Normal file
View File

@ -0,0 +1,55 @@
import request from 'superagent'
import cheerio from 'cheerio'
require('superagent-proxy')(request)
import formWatch from '../utils/formWatch'
import saveData from '../utils/saveData'
import {clearInterval} from 'timers'
// Handle of the polling interval started by getDetails ('' until the first run).
let timer = ''
// Index of the game whose detail page is fetched on the next tick.
let curIdx = 0
// Records enriched with detail-page fields, waiting to be saved.
let allData = []
/**
 * Starts the detail-page crawl for a list of games.
 *
 * Resets the progress index and the result buffer, then polls `getData` every
 * 3000 ms with the category name and the game list.
 *
 * @param {string} cateName - ranking category the games came from.
 * @param {Array<Object>} data - records produced by the ranking crawl.
 */
function getDetails(cateName, data) {
  const job = {cateName, data}
  curIdx = 0
  allData = []
  timer = setInterval(getData, 3000, job)
}
// Consecutive failed fetches tolerated for one game before it is skipped.
const MAX_FETCH_ATTEMPTS = 3
// Failed fetches so far for the game at curIdx.
let failedAttempts = 0

/**
 * One polling tick of the detail crawl.
 *
 * Fetches the detail page of the game at `curIdx` (2000 ms timeout), lets
 * `analyze` enrich the record, and advances the index. Once every game has
 * been visited, the interval is stopped and the collected records are saved.
 *
 * A failing request is retried on the following ticks, but only up to
 * MAX_FETCH_ATTEMPTS times; after that the game is skipped. Without the cap a
 * single permanently failing page would be requested forever and nothing
 * would ever be saved.
 *
 * @param {Object} job
 * @param {string} job.cateName - ranking category the games came from.
 * @param {Array<Object>} job.data - records to enrich; each needs `gameid`,
 *   `order` and `title`.
 */
async function getData({cateName, data}) {
  // True once the request for the current game has been issued, so that only
  // fetch/parse failures count towards the retry cap.
  let fetching = false
  try {
    if (!(curIdx < data.length)) {
      clearInterval(timer)
      saveData(allData)
      allData = []
      return
    }
    const game = data[curIdx]
    console.log(game.order + ':' + game.title)
    const url = `https://www.taptap.com/app/${game.gameid}`
    fetching = true
    const res = await request.get(url).timeout(2000)
    analyze(res.text, cateName, game)
    curIdx += 1
    failedAttempts = 0
  } catch (err) {
    console.log(err)
    if (!fetching) {
      return
    }
    failedAttempts += 1
    if (failedAttempts >= MAX_FETCH_ATTEMPTS) {
      // Give up on this game and let the crawl continue with the next one.
      curIdx += 1
      failedAttempts = 0
    }
  }
}
/**
 * Reads the counters from a game's detail page, writes them onto `info`, and
 * appends `info` to the module-level `allData` buffer.
 *
 * Errors are logged and swallowed; in that case the record is not appended.
 *
 * @param {string} str - HTML of the detail page.
 * @param {string} cateName - ranking category (currently unused here).
 * @param {Object} info - record from the ranking crawl; mutated in place.
 */
function analyze(str, cateName, info) {
  try {
    const $ = cheerio.load(str)
    // Reads the number shown on a tab; always base 10, 0 when it is not numeric.
    const tabCount = tab =>
      parseInt($(`a[data-taptap-tab="${tab}"] small`).text(), 10) || 0
    const {watch, reserve, sell, download} = formWatch($('.count-stats').text())
    Object.assign(info, {watch, reserve, sell, download})
    info.review = tabCount('review')
    info.topic = tabCount('topic')
    allData.push(info)
  } catch (err) {
    console.log(err)
  }
}
export default getDetails

18
src/utils/formWatch.js Normal file
View File

@ -0,0 +1,18 @@
/**
 * Extracts the follower / reservation / purchase / install counters from the
 * statistics text of a game's detail page.
 *
 * Each counter is looked up as "<digits> 人<label>". A counter that is missing,
 * or whose label is present without a number, is reported as 0 (the number
 * must have at least one digit, so the result is never NaN).
 *
 * @param {string} str - text such as "1234 人关注 56 人预约".
 * @returns {{watch: number, reserve: number, sell: number, download: number}}
 */
export default function formWatch(str) {
  const patterns = [
    ['watch', /(\d+) 人关注/],
    ['reserve', /(\d+) 人预约/],
    ['sell', /(\d+) 人购买/],
    ['download', /(\d+) 人安装/],
  ]
  const res = {}
  for (const [field, pattern] of patterns) {
    const match = pattern.exec(str)
    res[field] = match ? parseInt(match[1], 10) : 0
  }
  return res
}

8
src/utils/parseHtml.js Normal file
View File

@ -0,0 +1,8 @@
/**
 * Strips every newline and every backslash from an HTML string.
 *
 * @param {string} str - raw HTML text.
 * @returns {string} the text without "\n" characters and backslashes.
 */
export default function parseHtml(str) {
  const newlineOrBackslash = /[\n\\]/g
  return str.replace(newlineOrBackslash, '')
}

10
src/utils/saveData.js Normal file
View File

@ -0,0 +1,10 @@
import Game from '../model/Game'
/**
 * Persists a batch of game records with Game.insertMany.
 *
 * The insert is awaited, so the success message is printed only after the
 * write has finished and a rejected insert is caught and logged here instead
 * of escaping as an unhandled rejection.
 *
 * @param {Array<Object>} data - records to insert.
 * @returns {Promise<void>} settles once the insert has succeeded or failed;
 *   never rejects.
 */
export default async function saveData(data) {
  try {
    await Game.insertMany(data)
    console.log(`${data.length}条数据已保存`)
  } catch (err) {
    console.log(err)
  }
}