知乎爬虫

因工作需要爬取知乎话题,我用 Node.js 写了一个简单的爬虫,在此记录并分享。数据存储部分由 Python 实现,本文只介绍爬虫部分的实现。

文件结构

  1. zhihu_starter.js 爬虫启动器,负责定时执行任务,获取需要爬取的id列表
  2. zhihu_spider.js 爬虫核心程序,负责抓取数据并返回,以HTTP服务的形式监听8084端口
  3. spiderFunctions.js 爬虫数据处理,负责拼接url,并对zhihu_spider获取的数据进行筛选、存储等(注意:zhihu_starter.js 中通过 require("./SpiderFunctions") 引入该模块,在区分大小写的文件系统上,文件名大小写需与之保持一致)

zhihu_starter.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
//https://www.zhihu.com/topic/19567878/hot//待爬取地址

let schedule = require('node-schedule');
const fs = require("fs");
const axios = require('axios')

const sf = require("./SpiderFunctions");

// Minutes of every hour at which the scheduled crawl job fires.
const times1 = [5, 15, 25, 35, 45, 55];
const rule1 = new schedule.RecurrenceRule();
rule1.minute = times1;

// Heartbeat counter: how many crawl passes have been started so far.
let time = 1;

/**
 * Runs one crawl pass: hands every entry of `ids` to `sf.getList`, one entry
 * per timer tick, so the requests are spaced out instead of sent in a burst.
 * Also logs and increments the module-level heartbeat counter `time`.
 *
 * @param {Array<Object>} ids - Crawl targets; each entry is passed unchanged
 *   to `sf.getList`. Anything that is not a non-empty array is ignored.
 * @returns {void}
 */
function start(ids) {
  console.log(`第${time}次心跳`);
  time++;

  // A truthy non-array value has no numeric `length`; comparing the index
  // against it would never become true and the timer would never be cleared.
  if (!Array.isArray(ids) || ids.length === 0) {
    return;
  }

  // Delay between two consecutive targets.
  // NOTE(review): the original comment said "every 20 seconds" while the code
  // used 2000 ms; the code's value is kept here - confirm which one is intended.
  const intervalMs = 2000;

  let index = 0;
  const timer = setInterval(() => {
    console.log(`ids共${ids.length}个,中第${index}个开始执行`);
    sf.getList(ids[index], (cb_status) => {
      console.log(cb_status.msg);
      console.log('='.repeat(30));
    });
    index++;

    // Stop right after the last target instead of waiting for one more tick.
    if (index >= ids.length) {
      clearInterval(timer);
    }
  }, intervalMs);
}

// Scheduled job: downloads the list of crawl sources, picks the entry whose
// key is '知乎' and starts a crawl pass over its `content` list.
// Every failure path is logged; nothing is left as an unhandled rejection.
schedule.scheduleJob(rule1, async () => {
  try {
    // Fetch the list of ids to crawl.
    const res = await axios.get(`https://屏蔽/management/source?cate=1`);

    if (Number(res.data.status_code) !== 200) {
      console.log(`https://屏蔽/management/source?cate=1请求失败`);
      return;
    }

    const zhihuSource = res.data.data.find((source) => source.key === '知乎');
    if (!zhihuSource) {
      // Without this guard, reading `.content` of a missing entry would throw.
      console.log(`未找到key为知乎的抓取源`);
      return;
    }

    console.log('='.repeat(30));
    console.log(`请求知乎列表成功`);
    start(zhihuSource.content);
  } catch (err) {
    // Network errors and malformed responses end up here.
    console.log(`https://屏蔽/management/source?cate=1请求失败`, err);
  }
});

zhihu_spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
//知乎爬虫
//https://www.zhihu.com/api/v4/topics/19567878/feeds/top_activity?limit=5&after_id=5
var express = require('express');
const request = require('request');
const cheerio = require(`cheerio`);
const moment = require('moment');
const app = express();

/**
 * Decides whether the proxy endpoint may fetch `rawUrl`.
 *
 * Only absolute http(s) URLs on zhihu.com (or one of its subdomains) are
 * accepted. The server listens on all interfaces, so without this check any
 * client could make it fetch arbitrary URLs on its behalf.
 *
 * @param {*} rawUrl - Value of the `url` query parameter.
 * @returns {boolean} true when the URL may be fetched.
 */
function isAllowedTargetUrl(rawUrl) {
  // A repeated `url` parameter is parsed into an array by Express; reject it.
  if (typeof rawUrl !== 'string' || rawUrl === '') {
    return false;
  }
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch (err) {
    return false;
  }
  const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  const isZhihu = parsed.hostname === 'zhihu.com' || parsed.hostname.endsWith('.zhihu.com');
  return isHttp && isZhihu;
}

/**
 * Sends the "upstream request failed" response used by the proxy endpoint.
 *
 * @param {Object} res - Express response object.
 * @param {*} body - Raw upstream body (may be undefined), echoed for debugging.
 * @returns {void}
 */
function sendFetchFailure(res, body) {
  res.send({
    status_code: 204,
    status: `请求数据失败`,
    complete_data: body
  });
}

// Fetches a Zhihu URL on behalf of the caller.
// GET /?url=<encoded target URL> responds with
//   { status_code: 200, status, complete_data } when the upstream body is JSON,
//   { status_code: 204, ... } when the request failed or the body is not JSON,
//   { status_code: 405, message } when `url` is missing or not a Zhihu URL.
app.get('/', function (req, res, next) {
  const url = req.query.url ? req.query.url : '';
  if (!isAllowedTargetUrl(url)) {
    res.send({
      status_code: 405,
      message: `域名不正确,当前域名是${url}`
    });
    return;
  }

  // `request` is callback based; it returns no promise, so there is nothing to await.
  request({
    url: url,
    headers: {
      "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Mobile Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
      "Content-Type": "application/x-www-form-urlencoded"
    },
  }, function (error, response, body) {
    if (error) {
      sendFetchFailure(res, body);
      return;
    }

    let complete_data;
    try {
      complete_data = JSON.parse(body);
    } catch (parseError) {
      sendFetchFailure(res, body);
      return;
    }

    // Sent outside the try block so a failure while sending cannot trigger a
    // second response from the catch branch.
    res.send({
      status_code: 200,
      status: `success`,
      complete_data: complete_data
    });
  });
});

// Listens on port 8084 on all network interfaces.
app.listen(8084, '0.0.0.0');

spiderFunctions.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
const axios = require('axios')


/**
 * Builds the news record that is posted to the management API from one feed entry.
 *
 * @param {Object} one - Feed entry; `one.target.question` must be present.
 * @param {Object} context - `{ cate, newstype_id, source_name }` of the crawl target.
 * @returns {Object} The record to submit.
 * @throws {TypeError} When the entry has no `target.question`.
 */
function _buildNewsPayload(one, { cate, newstype_id, source_name }) {
  const { target } = one;
  const payload = {
    content: "",
    content_url: [],
    cover_type: newstype_id,
    cover_url: [], // stays empty when the entry has no thumbnail
    introduction: `${target.question.title}`,
    is_update: 0,
    media_type: 0,
    news_type: [newstype_id],
    open_url: `https://www.zhihu.com/question/${target.question.id}/answer/${target.id}`,
    source_name: `${source_name}`,
    tag: [cate],
    title: `${target.question.title}`,
    video_url: "",
    weight: -3,
    created_time: target.created_time
  };

  if (target.thumbnail) {
    payload.cover_url.push({
      height: 164,
      width: 248,
      url_type: '图片',
      url_name: `${target.thumbnail}`
    });
  }
  return payload;
}

/**
 * Submits one feed entry unless it is not newer than `context.sign`.
 * Never rejects: every outcome is reported through the returned status.
 *
 * @param {Object} one - Feed entry.
 * @param {Object} context - `{ cate, newstype_id, sign, source_name }`.
 * @param {Object} httpClient - Object with an axios-compatible `post(url, data)`.
 * @returns {Promise<{status: number, msg: string}>} 200 submitted,
 *   202 skipped as already stored, 203 building or submitting failed.
 */
async function _submitFeedItem(one, context, httpClient) {
  try {
    const payload = _buildNewsPayload(one, context);

    // `sign` of 0 means "submit everything"; otherwise only entries created
    // after `sign` are submitted.
    const signValue = Number(context.sign);
    const isNew = signValue === 0 || signValue < one.target.created_time;
    if (!isNew) {
      return { status: 202, msg: `已经存储过` };
    }

    const result = await httpClient.post(`https://屏蔽/management/news`, payload);
    console.log(result.data.status_code, result.data.message);
    return { status: 200, msg: `抓取数据成功` };
  } catch (err) {
    console.log(`提交出错`, err);
    return { status: 203, msg: `提交出错` };
  }
}

/**
 * Crawls the timeline of one Zhihu topic through the local proxy on port 8084
 * and submits every new entry to the management API.
 *
 * `callback` is invoked once per feed entry with that entry's own status
 * object, or exactly once with status 201 when the feed could not be fetched.
 *
 * @param {Object} id - Crawl target: `{ cate, cate_id, newstype_id, sign, source_name }`.
 * @param {function({status: number, msg: string}): void} callback - Status receiver.
 * @param {Object} [httpClient=axios] - axios-compatible client (`get`, `post`);
 *   replaceable so the function can run without network access.
 * @returns {Promise<void>} Settles after every callback has been invoked.
 */
async function _getList(id, callback, httpClient = axios) {
  const { cate, cate_id, newstype_id, sign, source_name } = id;

  // The Zhihu URL carries its own query string, so it must be encoded as a
  // whole; otherwise `&after_id=5` is parsed as a parameter of the proxy
  // request and never reaches Zhihu.
  const zhihu_url = `https://www.zhihu.com/api/v4/topics/${cate_id}/feeds/timeline_activity?limit=5&after_id=5`;
  const target_url = `http://127.0.0.1:8084/?url=${encodeURIComponent(zhihu_url)}`;

  let res;
  try {
    res = await httpClient.get(target_url);
  } catch (err) {
    console.log(`抓取数据失败`, err);
    callback({ status: 201, msg: `抓取数据失败` });
    return;
  }

  const body = res && res.data;
  const complete_data = body && Number(body.status_code) === 200 ? body.complete_data : null;
  const feed_items = complete_data ? complete_data.data : null;
  if (!Array.isArray(feed_items)) {
    console.log(`抓取数据失败`);
    callback({ status: 201, msg: `抓取数据失败` });
    return;
  }

  // Entries are submitted concurrently. Each one gets its own status object,
  // so a skipped entry cannot overwrite the status reported for another one.
  const context = { cate, newstype_id, sign, source_name };
  await Promise.all(feed_items.map(async (one) => {
    callback(await _submitFeedItem(one, context, httpClient));
  }));
}



// Public API of this module: `getList` exposes `_getList`, the per-target crawl entry point.
module.exports = {
getList: _getList
}