修改电影抓取逻辑

This commit is contained in:
zhl 2019-05-08 13:49:18 +08:00
parent 593bd3e758
commit 20492021ea
5 changed files with 111 additions and 62 deletions

View File

@ -42,6 +42,8 @@ const Movies = new Schema({
open: {type: Boolean, default: false},
// 购买价格
price: {type: Number, default: 10},
// 是否是新记录
is_new: {type: Boolean, default: true},
// 视频分类 movie: 电影; tv: 电视剧; show: 综艺节目
category: {type: String},
// 是否已删除

View File

@ -5,25 +5,23 @@ import Movie from '../models/Movies';
import generalQueue from '../utils/general.queue';
const URL_BASE = 'https://www.dandanzan.com'
const maxIdx = 100000;
/**
* 处理一个页面
* @param {String} subLink
* @param {String} category, 种类 movie: 电影 tv: 电视剧 show: 综艺
* @param {Number} sortIdx
* */
const parseOnePage = async (subLink, category) => {
const parseOnePage = async (subLink, category, sortIdx) => {
const url = `${URL_BASE}${subLink}`
try {
let html = await netUtil.getData(url, {})
let html = await netUtil.getDataProxy(url, {})
const $ = cheerio.load(html);
if ($('.error404').text()) {
console.log(`>>>>>>>>>>>> ${url} not found`);
} else {
let resourceStr = stringUtil.getContentByReg(html,/links='(.+?)\|'/);
let idx = parseInt(subLink.replace('/dianying/', '')
.replace('.html', '')
.replace('/dongman/', '')
.replace('/dianshiju/', '')
.replace('/zongyi/', ''));
let idx = parseInt(stringUtil.findByReg(subLink, /\/.+?\/(\d+?)\.html/));
const type = $('meta[property="og:video:class"]').attr('content');
let typeArr = type ? type.split(',') : [];
let arr = resourceStr.replace(/\|/g, '#').split('#');
@ -39,7 +37,7 @@ const parseOnePage = async (subLink, category) => {
let region = $('meta[property="og:video:area"]').attr('content');
let record = await Movie.findByGid(idx);
if (!record) {
record = {
record = new Movie({
gid: idx,
name: $('meta[property="og:title"]').attr('content'),
type: typeArr,
@ -53,63 +51,24 @@ const parseOnePage = async (subLink, category) => {
region: region ? region.split(',') : [],
year: Number($('meta[property="og:video:release_date"]').attr('content')),
category: category,
}
sortIdx: sortIdx,
is_new: true,
})
} else {
record.resources = resourceArr;
record.sortIdx = sortIdx;
}
await record.save();
console.log(`@@@@@ ${subLink} @ ${record.name} saved`);
console.log(`@@@@@ ${sortIdx}: ${subLink} @ ${record.name} saved`);
}
} catch (err) {
console.log(err);
}
}
/**
 * Crawl one list page: queue a detail-page job for every thumbnail link found
 * on it, then follow the "next page" link recursively until there is none.
 *
 * Download and queueing errors are logged and do not propagate to the caller.
 *
 * @param {String} subPage path of the list page, appended to URL_BASE
 * @param {String} category category label handed through to parseOnePage
 * */
const parseListPage = async (subPage, category) => {
  const url = `${URL_BASE}${subPage}`;
  console.log(`begin parse category: ${category} page: ${subPage}`);
  let html;
  try {
    html = await netUtil.getData(url, {});
  } catch (err) {
    // Best effort: a failed download is logged and ends this branch of the crawl.
    console.log(err);
  }
  if (html) {
    const $ = cheerio.load(html);
    const pages = [];
    $('.thumbnail').each(function () {
      const href = $(this).attr('href');
      // A thumbnail without an href would otherwise be queued as "undefined".
      if (href) {
        pages.push(href);
      }
    });
    for (const page of pages) {
      try {
        generalQueue.addQueue({
          run: async function () {
            await parseOnePage(page, category);
          }
        });
      } catch (err) {
        console.log(err);
      }
    }
    // A cheerio selection is an object and therefore always truthy, so the
    // decision must be made on the extracted href, not on $('.next-page').
    const nextStr = $('.next-page a').attr('href');
    if (nextStr) {
      console.log('has next page: ', nextStr);
      try {
        await parseListPage(nextStr, category);
      } catch (err) {
        console.log(err);
      }
    } else {
      console.log('########################### ALL LIST PAGE END ###########################');
    }
  }
  console.log(`end parse category: ${category} page: ${subPage}`);
}
const parseAllMovie = async (category) => {
console.time('all');
const parseListPage = async (idx, category) => {
let subName = 'dianying';
let index = 0;
switch (category) {
case 'movie':
subName = 'dianying';
@ -124,16 +83,72 @@ const parseAllMovie = async (category) => {
subName = 'dongman';
break;
}
const subPage = `/${subName}/index.html`
await parseListPage(subPage, category);
const subPage = !idx ? `/${subName}/----onclick.html` : `/${subName}/---${idx}-onclick.html`
const url = `${URL_BASE}${subPage}`
console.log(`begin parse category: ${category} page: ${subPage}`);
let html;
try {
html = await netUtil.getDataProxy(url, {})
} catch (err) {
console.log(err);
}
if (html) {
const $ = cheerio.load(html);
let hrefs = $('.thumbnail');
let pages = [];
$(hrefs).each(function(i, link){
pages.push($(this).attr('href'));
});
for(let page of pages) {
try {
let sortIdx = maxIdx - (idx * 24 + (index ++) );
generalQueue.addQueue({
run: async function () {
await parseOnePage(page, category, sortIdx);
}
})
} catch (err) {
console.log(err);
}
}
if (!idx) {
const lastPage = $('.pagination ul li').last().find('a').attr('href');
return parseInt(stringUtil.findByReg(lastPage, /\/.+?\/---(\d+?)-onclick\.html/));
} else {
return 1;
}
// if ($('.next-page')) {
// let nextStr = $('.next-page a').attr('href');
// console.log('has next page: ', nextStr);
// try {
// await parseListPage(nextStr, category);
// } catch (err) {
// console.log(err);
// }
// } else {
// console.log('########################### ALL LIST PAGE END ###########################');
// }
} else {
return 1;
}
}
/**
 * Crawl every list page of one category.
 *
 * The first call (page index 0) reports how many pages there are; when that
 * number is greater than 1, pages 1..N are then crawled one after another.
 *
 * @param {String} category category label forwarded to parseListPage
 * */
const parseAllMovie = async (category) => {
  console.time('all');
  const allPageNo = await parseListPage(0, category);
  console.log('app page is', allPageNo);
  // Also covers a NaN page count: any comparison with NaN is false.
  if (!(allPageNo > 1)) {
    return;
  }
  let pageNo = 1;
  while (pageNo <= allPageNo) {
    // Sequential on purpose: each list page is awaited before the next starts.
    await parseListPage(pageNo, category);
    pageNo += 1;
  }
}
export default {
run: async () => {
await parseAllMovie('movie');
// await parseAllMovie('movie');
// await parseAllMovie('tv');
// await parseAllMovie('show');
// await parseAllMovie('cartoon');
await parseAllMovie('cartoon');
// console.log('all done');
// let html = await netUtil.getData('https://wechat-test.kingsome.cn/', {})
// console.log(html);

View File

@ -14,7 +14,7 @@ const requestData = (options, encoding) => {
return reject(err);
}
if (response.statusCode >= 300) {
return reject(new Error('server response code: ' + response.statusCode));
return reject(new Error(' server response code: ' + response.statusCode + ' with url: ' + options.url));
}
if (encoding) {
body = iconv.decode(body, encoding);
@ -40,6 +40,22 @@ export default {
return requestData(options);
},
getData(url, header, encoding, gzip) {
header = header || {
'Cache-Control': 'no-cache',
}
header['User-Agent'] = random_useragent.getRandom();
const options = {
method: 'GET',
url: url,
headers: header,
};
if (encoding) {
options.encoding = null;
}
(gzip) && (options.gzip = true);
return requestData(options, encoding);
},
getDataProxy(url, header, encoding, gzip) {
header = header || {
'Cache-Control': 'no-cache',
}

View File

@ -1,8 +1,10 @@
import stringUtil from './string.util';
const proxys = [
'http://101.71.41.169:443',
'http://116.196.81.58:3128',
'http://113.200.56.13:8010'
'http://113.200.56.13:8010',
'http://65.52.174.40:80',
'http://165.22.254.199:8080',
'http://88.255.101.241:8080',
'http://117.197.117.50:8080'
];
export default {

View File

@ -21,6 +21,20 @@ export default {
return content.replace(/<.+?>/g, '').replace(/\s/g, '');
}
},
/**
* 根据正则查找内容
* @param {string} content
* @param {RegExp} re 正则表达式 例如/\/.+?\/(\d+?)\.html/
* @return {string} 匹配到的内容 没找到则返回''
* */
findByReg(content, re) {
const contents = content.match(re);
let result = '';
if (contents) {
if (contents.length > 1) result = contents[1];
}
return result;
},
randomNum(minNum, maxNum) {
return parseInt(Math.random()*(maxNum-minNum+1)+minNum, 10);
},