新浪微博爬虫

因工作需要编写爬虫抓取微博,并根据时间戳判断是否存储,于是随手用 Node.js 写了一个,在此记录并分享。

文件结构

  1. sina_starter.js 爬虫启动器,负责定时执行任务,获取需要爬取的id列表
  2. sina_spider.js 爬虫核心程序,负责爬取数据并返回
  3. sina_spider_functions.js 爬虫数据处理,负责url拼接,以及对sina_spider获取的数据筛选存储等

sina_starter.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
let schedule = require('node-schedule');
const fs = require("fs");
const axios = require('axios')
const sf = require("./sina_spider_functions");

// Heartbeat counter: incremented by start() each time a crawl round begins.
let time = 1;
// Minutes of each hour at which the scheduled job fires.
var times1 = [5, 25, 45];
// Recurrence rule handed to schedule.scheduleJob.
var rule1 = Object.assign(new schedule.RecurrenceRule(), { minute: times1 });

/**
 * Runs one crawl round ("heartbeat"): logs the heartbeat number, then hands the
 * ids to sf.getList one at a time, spaced out by a fixed delay.
 *
 * @param {Array<Object>} ids - Crawl targets; each entry is passed unchanged to sf.getList.
 * @param {number} [intervalMs=30000] - Delay between consecutive sf.getList calls
 *   (also the delay before the first call).
 * @returns {void}
 */
function start(ids, intervalMs = 30 * 1000) {
  console.log(`第${time}次心跳`);
  time++;

  // Nothing to crawl: without this guard the timer would still fire once and
  // call sf.getList(undefined).
  if (!Array.isArray(ids) || ids.length === 0) {
    return;
  }

  let i = 0;
  const timer = setInterval(() => {
    const current = ids[i];
    i++;
    // Stop the timer before doing the work so a failure below cannot leave it running.
    if (i > ids.length - 1) {
      clearInterval(timer);
    }
    try {
      sf.getList(current);
    } catch (err) {
      // One bad entry must not take down the process or block the remaining ids.
      console.log(err);
    }
  }, intervalMs);
}

// Scheduled entry point: every time rule1 fires, read the id list from disk
// and start a crawl round with it.
schedule.scheduleJob(rule1, () => {
  // The file content is parsed as JSON and handed to start() as the id list.
  fs.readFile('./uids.js', { encoding: "utf-8" }, (err, data) => {
    if (err) {
      // Skip this round; the next scheduled firing will try again.
      console.log("err1", err);
      return;
    }

    let uids;
    try {
      // Parsing runs inside the asynchronous callback, so it needs its own
      // try/catch: an outer one around readFile() cannot see this exception.
      uids = JSON.parse(data);
    } catch (error) {
      console.log(error);
      return;
    }

    start(uids);
  });
});

sina_spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
var express = require('express');
const request = require('request');
const cheerio = require(`cheerio`);
const moment = require('moment');
var app = express();

// Weibo scraping endpoint.
// This scraper was written early on and has never been optimised - use with care.

/**
 * GET /?url=<weibo detail page url>
 *
 * Fetches the given page, extracts the `$render_data` JSON embedded in its
 * second <script> element and responds with:
 *   - { status_code: 200, both: <raw render data>, data: <normalised weibo> } on success
 *   - { status_code: 405, message } when no url is supplied
 *   - { status_code: 406, both: '', data: '', message } when fetching or parsing fails
 */
app.get('/', function (req, res, next) {
  const url = typeof req.query.url === 'string' ? req.query.url : '';
  if (!url) {
    res.send({
      status_code: 405,
      message: `域名不正确,当前域名是${url}`
    });
    return false;
  }

  // Single failure response, used for every scraping/parsing error so the
  // client always gets an answer instead of a hanging connection.
  const sendFailure = () => {
    if (res.headersSent) {
      return;
    }
    res.send({
      status_code: 406,
      both: ``,
      data: ``,
      message: `抓取失败`
    });
  };

  // NOTE(security): `url` comes straight from the query string and is fetched
  // as-is, so this endpoint will request any address the caller supplies.
  // Consider restricting it to the expected weibo hosts.
  request({
    url: url,
    headers: {
      "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Mobile Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
      "Content-Type": "application/x-www-form-urlencoded"
    },
  }, function (error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      console.log(`请求微博页面失败`, error || (response && response.statusCode));
      return sendFailure();
    }

    // Everything below runs in an asynchronous callback: an uncaught exception
    // here would crash the process, so any unexpected page shape is turned
    // into a 406 response instead.
    try {
      const $ = cheerio.load(body);
      const data_ = $('script:nth-child(2)').html();

      let data;
      try {
        const marker = 'var $render_data = [';
        let raw = data_.substr(data_.indexOf(marker) + marker.length);
        raw = raw.substr(0, raw.indexOf('[0] || {}') - 1);
        data = JSON.parse(raw);
      } catch (parseError) {
        return sendFailure();
      }

      const targetData = {
        weibo_url: url,
        screen_name: data.status.user.screen_name, // nickname
        avatar_hd: data.status.user.avatar_hd, // HD avatar
        imgHost: `https://` + data.status.user.avatar_hd.split(`/`)[2] + `/` + data.status.user.avatar_hd.split(`/`)[3] + `/`,
        created_at: null, // filled in below as a unix timestamp (seconds); source format e.g. 'Thu Sep 26 18:53:21 +0800 2019'
        source: data.status.source, // posting client
        id: data.status.id, // id of this weibo
        uid: data.status.user.id, // user id
        textLength: data.status.textLength, // body length
        text: data.status.text, // original body (html)
        handle_text: [], // processed body, Array
        type: '', // media type; known values: video, top_topic, search_topic; 'normal' when absent
        edit_count: data.status.edit_count ? data.status.edit_count : 0, // number of edits
        profile_url: data.status.user.profile_url, // profile page
        pic_ids: [],
        pics: [], // HD pictures
        media_info: {},
        page_url: '',
        text_only: '',
      };

      try {
        targetData.created_at = new Date(moment(data.status.created_at).format('YYYY-MM-DD HH:mm:ss')).getTime() / 1000;
      } catch (timeError) {
        console.log(`转换时间发生错误`);
      }

      try {
        targetData.type = data.status.page_info.type;
      } catch (typeError) {
        // No page_info at all: plain text/picture weibo.
        targetData.type = 'normal';
      }

      try {
        if (targetData.type === 'video') {
          targetData.text_only = data.status.page_info.content2; // body as plain text
          targetData.page_url = data.status.page_info.page_url; // detail page of this weibo
          targetData.media_info = data.status.page_info.media_info; // video info
        } else {
          // pics first, then the large url of each picture goes into pic_ids.
          targetData.pics = data.status.pics;
          targetData.pics.forEach((pic) => {
            targetData.pic_ids.push(pic.large.url);
          });
          // search_topic without a pics list carries a single original picture instead.
          if (targetData.type === 'search_topic' && targetData.pics.length === 0) {
            targetData.pic_ids.push(data.status.original_pic);
          }
        }
      } catch (mediaError) {
        console.log(`处理类型发生错误`, mediaError);
      }

      // Break the html body into link / image / line-break / text fragments.
      let text_dom;
      try {
        text_dom = $(targetData.text);
      } catch (domError) {
        text_dom = $(`<div>${targetData.text}</div>`);
      }

      text_dom.each(function (index, element) {
        if ($(this).attr('href')) {
          targetData.handle_text.push({
            type: 'link',
            type_cn: '超链接',
            // @-mentions carry a relative href, so prefix the mobile site host.
            href: $(this).text().indexOf(`@`) > -1 ? encodeURI(`https://m.weibo.cn` + $(this).attr('href')) : $(this).attr('href'),
            text: $(this).text()
          });
        } else if ($(this).attr('src')) {
          targetData.handle_text.push({
            type: 'img',
            type_cn: '图片',
            href: $(this).attr('src') ? $(this).attr('src') : ''
          });
        } else if ($(this).text() == '') {
          // Nodes without children (e.g. text nodes) have no `children` array.
          (element.children || []).forEach((child) => {
            if (child.name == 'img') {
              targetData.handle_text.push({
                type: 'img',
                type_cn: '图片',
                src: child.attribs.src ? child.attribs.src : '',
                alt: child.attribs.alt ? child.attribs.alt : '',
                style: child.attribs.style ? child.attribs.style : '',
              });
            }
          });
          if (element.name == 'br') {
            targetData.handle_text.push({
              type: 'br',
              type_cn: '换行',
            });
          }
        } else {
          targetData.handle_text.push({
            type: 'text',
            type_cn: '文本',
            text: $(this).text()
          });
        }
      });

      // Still empty after processing? Fall back to the whole body as one text fragment.
      if (!targetData.handle_text || targetData.handle_text.length == 0) {
        targetData.handle_text.push({
          type: 'text',
          type_cn: '文本',
          text: targetData.text
        });
      }

      res.send({
        status_code: 200,
        both: data,
        data: targetData
      });
    } catch (unexpected) {
      console.log(unexpected);
      sendFailure();
    }
  });
});
app.listen(8083,'0.0.0.0');


sina_spider_functions.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
const axios = require('axios')
// Request headers imitating a mobile Chrome browser issuing an XMLHttpRequest.
// NOTE(review): nothing in this listing passes `headers` to axios - confirm
// whether the axios calls in _getList were meant to send them.
const headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Mobile Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"Content-Type": "application/x-www-form-urlencoded"
}


/**
 * Builds the body POSTed to the management API for one weibo.
 *
 * @param {Object} detail - Normalised weibo returned by the sinaspider endpoint.
 * @param {*} tags - Stored as `tag`.
 * @param {*} news_type - Stored as `news_type`.
 * @param {Array<Object>} news_type_all - The `id` of its first entry is stored as `cover_type`.
 * @param {string} introduction - Stored as `introduction` (the sinaspider url of this weibo).
 * @returns {Object} Request body.
 */
function _buildNewsPayload(detail, tags, news_type, news_type_all, introduction) {
  return {
    "title": "",
    "content": detail,
    "media_type": 9,
    "news_type": news_type,
    "cover_type": news_type_all[0].id,
    "cover_url": [],
    "video_url": "",
    "news_push": false,
    "news_address": 0,
    "push_message": "",
    "news_id": null,
    "post_user": "",
    "content_url": [],
    "introduction": introduction,
    "is_comment": 1,
    "tag": tags,
    "sub_id": 0,
    "weight": -3,
    "auditor": 1,
    "match_pk": 0
  };
}

/**
 * Fetches one weibo through the sinaspider endpoint and stores it when it is
 * newer than `created_at`. Never rejects: failures are logged.
 *
 * @param {string} item - Weibo url to scrape.
 * @param {number} created_at - Timestamp of the newest stored weibo; falsy means "store everything".
 * @param {Object} config - The `tags`, `news_type` and `news_type_all` of the crawl target.
 * @returns {Promise<void>}
 */
async function _fetchAndStore(item, created_at, { tags, news_type, news_type_all }) {
  const detailUrl = `https://屏蔽.com/sinaspider/?url=${item}`;
  try {
    const sina_detail = await axios.get(detailUrl);
    const detail = sina_detail.data.data;

    // The scraper answers with `data: ''` on failure; storing that would create
    // an empty record whenever created_at is 0.
    if (!detail || typeof detail !== 'object') {
      console.log(`抓取详情失败`, item);
      return;
    }

    // Only store when there is no reference timestamp or this weibo is newer.
    if (created_at && !(detail.created_at > created_at)) {
      return;
    }

    console.log(`---存一条---`);
    const payload = _buildNewsPayload(detail, tags, news_type, news_type_all, detailUrl);
    const response = await axios.post(`https://屏蔽.com/management/network`, payload);
    if (response.data.status_code == 200) {
      console.log(`存储成功`);
    }
  } catch (err) {
    console.log(err);
  }
}

/**
 * Crawls the latest weibos of one user and stores those newer than the most
 * recently stored one.
 *
 * Steps: profile info -> weibo list -> timestamp of the last stored weibo ->
 * detail of each listed weibo (one every 7 seconds) -> store.
 *
 * @param {Object} id - Crawl target; `uid`, `tags`, `news_type` and `news_type_all` are read.
 * @returns {Promise<void>} Resolves once every listed weibo has been processed.
 *   Never rejects: failures are logged and end the crawl for this user.
 */
async function _getList(id) {
  console.log(id);
  if (!id || id.uid == null) {
    console.log(`缺少uid,跳过`);
    return;
  }

  const { uid, tags, news_type, news_type_all } = id;
  try {
    // Profile info: `more` carries the container id of the user's weibo list.
    const profile = await axios.get(`https://m.weibo.cn/profile/info?uid=${uid}`);
    const data = profile.data.data;
    const url2 = `https://m.weibo.cn/api/container/getIndex?containerid=${data.more.replace('/p/','')}_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page=1`;

    // Weibo list: build the desktop url of every card of type 9 from its bid.
    const listing = await axios.get(url2);
    const list = listing.data.data.cards
      .filter((card) => card.card_type == 9)
      .map((card) => `https://www.weibo.com/${uid}/${card.mblog.bid}`);

    // Timestamp of the last stored weibo; 0 when the API does not answer 200.
    const latest = await axios.get(`https://屏蔽.com/management/network?uid=${uid}`);
    const created_at = latest.data.status_code == 200 ? latest.data.data.created_at : 0;

    // Fetch the details one by one, 7 seconds apart, to keep the request rate low.
    await Promise.all(list.map((item, index) => new Promise((resolve) => {
      setTimeout(() => {
        resolve(_fetchAndStore(item, created_at, { tags, news_type, news_type_all }));
      }, 7 * 1000 * index);
    })));
  } catch (err) {
    // A failed request or an unexpected response shape ends this user's crawl
    // instead of surfacing as an unhandled rejection.
    console.log(err);
  }
}


module.exports = {
getList: _getList
}