Weibo Crawler Examples
1. Comments
Too lazy to write this up in detail, so here is the source code. If it fails to crawl, pass in your own cookie.
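The cookie can be copied from a logged-in m.weibo.cn session via the browser's developer tools. A minimal sketch of keeping it out of the source file, assuming a hypothetical WEIBO_COOKIE environment variable:

import os

# Hypothetical: read the cookie from the WEIBO_COOKIE environment
# variable instead of hardcoding it in the script
headers = {
    "cookie": os.environ.get("WEIBO_COOKIE", ""),
}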
import requests  # HTTP requests
import random    # with time, throttles request frequency
import time      # with random, throttles request frequency
import json      # parses the JSON responses
from tqdm import tqdm  # progress bar
# Performs one request-and-parse round trip.
# Takes the URL of a comment JSON page and returns a dict of comment info:
# (max, total_number, max_id, max_id_type, comment_list[])
def getCommentInfo(commenturl):
    # Request the comment JSON page
    # print(commenturl)
    headers = {
        "cookie": "",
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36 Edg/116.0.1938.76'
    }
    response = requests.get(commenturl, headers=headers)
    # Parse the JSON: max_id, max_id_type, comment_list[];
    # max and total_number only matter on the first request
    dic = {
        'max': 0,
        'total_number': 0,
        'max_id': 0,
        'max_id_type': 0,
        'comment_list': []
    }
    comment_json = json.loads(response.text)
    if comment_json["ok"] == 1:
        data = comment_json["data"]
        dic["max"] = data["max"]
        dic["total_number"] = data["total_number"]
        dic["max_id"] = data["max_id"]
        dic["max_id_type"] = data["max_id_type"]
        comments = data["data"]
        for li in comments:
            user = li["user"]
            name = user["screen_name"]
            gender = user["gender"]
            source = li["source"]
            text = li["text"]
            # print(name + "(" + gender + "|" + source + ")" + ":" + text)
            dic["comment_list"].append(name + "(" + gender + "|" + source + ")" + ":" + text)
        return dic
    # Without an explicit else, Python would return None on failure
    else:
        return dic
# Calls getCommentInfo repeatedly, feeding in a different max_id each time.
# Takes a post URL and collects the comment info.
def getcomments():
    blog = input("Enter the post URL: ")
    id = blog.split('/')[-1]
    mid = blog.split('/')[-1]
    max_id_type = 0
    comment_list = []
    print("Analyzing...")
    comment_url = "https://m.weibo.cn/comments/hotflow?id=" + id + "&mid=" + mid + "&max_id_type=" + str(max_id_type)
    dic = getCommentInfo(comment_url)
    # max_page and total_number are set by the first request and tell us
    # how many pages and comments to expect
    max_page = dic["max"]
    total_number = dic["total_number"]
    max_id = dic["max_id"]
    max_id_type = dic["max_id_type"]
    comment_list = comment_list + dic["comment_list"]
    print("Found " + str(max_page) + " pages, " + str(total_number) + " comments in total")
    print("Crawling...")
    for i in tqdm(range(max_page - 1)):
        time.sleep(random.uniform(1.0, 3.0))
        # max_id == 0 means there is no further page
        if dic["max_id"] != 0:
            comment_url = "https://m.weibo.cn/comments/hotflow?id=" + id + "&mid=" + mid + "&max_id=" + str(max_id) + "&max_id_type=" + str(max_id_type)
            dic = getCommentInfo(comment_url)
            max_id = dic["max_id"]
            max_id_type = dic["max_id_type"]
            comment_list = comment_list + dic["comment_list"]
    # Preview?
    preview = input(str(len(comment_list)) + " records collected, preview (y/n): ")
    if preview == "y" or preview == "Y":
        for comment in comment_list:
            print(comment)
    # Save?
    save = input(str(len(comment_list)) + " records collected, save (y/n): ")
    if save == "y" or save == "Y":
        print("Writing...")
        with open("comment.txt", "w", encoding="utf-8") as f:
            for comment in tqdm(comment_list):
                f.write(comment + "\n")

if __name__ == '__main__':
    while True:
        print("-------Weibo Comment Crawler-------")
        print("                        by liangyue")
        getcomments()
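For reference, the hotflow responses that getCommentInfo parses look roughly like the sketch below. The key layout mirrors the parsing code above; the values are made-up placeholders, not real data:

# Rough shape of one hotflow response (placeholder values only)
sample_hotflow_response = {
    "ok": 1,
    "data": {
        "max": 42,            # total number of comment pages
        "total_number": 830,  # total number of comments
        "max_id": 13857204,   # cursor for the next page; 0 = last page
        "max_id_type": 0,
        "data": [             # the comments on this page
            {
                "user": {"screen_name": "some_user", "gender": "m"},
                "source": "some_region",
                "text": "some comment text"
            }
        ]
    }
}

Note that each response carries the max_id for the next request, which gets fed back into the following URL: this is cursor-based pagination rather than numbered pages, which is why the loop keeps passing max_id forward.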
2. Posts
import requests  # HTTP requests
import random    # with time, throttles request frequency
import time      # with random, throttles request frequency
import json      # parses the JSON responses
from tqdm import tqdm  # progress bar
def analyse(blogurl):
    # Request & response
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36 Edg/116.0.1938.76'
    }
    response = requests.get(blogurl, headers=headers)
    # Parse the response
    list_dic_blog = []
    response_json = json.loads(response.text)
    if response_json["ok"] == 1:
        data = response_json["data"]
        cards = data["cards"]
        for card in cards:
            dic_blog = {
                "user_name": "",
                "user_gender": "",
                "user_region": "",
                "blog_text": "",
                "blog_url": ""
            }
            # A post can sit directly on the card...
            if "mblog" in card:
                myblog = card["mblog"]
                user = myblog["user"]
                dic_blog["user_name"] = user["screen_name"]
                dic_blog["user_gender"] = user["gender"]
                if "status_province" in myblog:
                    dic_blog["user_region"] = myblog["status_province"]
                mid = myblog["mid"]
                dic_blog["blog_text"] = myblog["text"]
                dic_blog["blog_url"] = "https://m.weibo.cn/detail/" + mid
                list_dic_blog.append(dic_blog)
            # ...or be nested one level down inside a card_group
            if "card_group" in card:
                card_group = card["card_group"]
                if "mblog" in card_group[0]:
                    myblog = card_group[0]["mblog"]
                    user = myblog["user"]
                    dic_blog["user_name"] = user["screen_name"]
                    dic_blog["user_gender"] = user["gender"]
                    if "status_province" in myblog:
                        dic_blog["user_region"] = myblog["status_province"]
                    mid = myblog["mid"]
                    dic_blog["blog_text"] = myblog["text"]
                    dic_blog["blog_url"] = "https://m.weibo.cn/detail/" + mid
                    list_dic_blog.append(dic_blog)
        return list_dic_blog
    # Without an explicit else, Python would return None on failure
    else:
        return list_dic_blog
def getblog():
    keyword = input("Enter a keyword: ")
    page = input("Enter the number of pages to crawl (1-100): ")
    page = int(page)
    if page < 1 or page > 100:
        exit(418)
    print("Crawling...")
    list_dic_blog = []
    for i in tqdm(range(page)):
        if i == 0:
            # The first search page needs no page parameter
            blogurl = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D" + keyword + "&page_type=searchall"
            list_dic_blog = list_dic_blog + analyse(blogurl)
        else:
            time.sleep(random.uniform(1, 2))
            # Subsequent pages are numbered from 2
            blogurl = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D" + keyword + "&page_type=searchall" + "&page=" + str(i + 1)
            list_dic_blog = list_dic_blog + analyse(blogurl)
    # Preview?
    preview = input(str(len(list_dic_blog)) + " records collected, preview (y/n): ")
    if preview == "y" or preview == "Y":
        for dic_blog in list_dic_blog:
            print(dic_blog)
    # Save?
    save = input(str(len(list_dic_blog)) + " records collected, save to blog.txt (y/n): ")
    if save == "y" or save == "Y":
        with open("blog.txt", "w", encoding="utf-8") as f:
            for dic_blog in tqdm(list_dic_blog):
                f.write(str(dic_blog) + "\n")

if __name__ == '__main__':
    print("-------Weibo Posts-------")
    print("              By liangyue")
    while True:
        getblog()
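The containerid value in the search URL is just the percent-encoded string 100103type=1&q= followed by the keyword (%3D decodes to = and %26 to &). The getIndex responses that analyse walks look roughly like the sketch below; only the key layout comes from the parsing code above, the values are placeholders:

# Rough shape of one getIndex search response (placeholder values only)
sample_getindex_response = {
    "ok": 1,
    "data": {
        "cards": [
            {
                # a post can sit directly on the card...
                "mblog": {
                    "mid": "4941234567890123",         # placeholder post id
                    "text": "some post text",
                    "status_province": "some_region",  # not always present
                    "user": {"screen_name": "some_user", "gender": "f"}
                }
            },
            {
                # ...or be nested one level down inside a card_group
                "card_group": [
                    {"mblog": {"mid": "4941234567890124",
                               "text": "another post",
                               "user": {"screen_name": "other_user", "gender": "m"}}}
                ]
            }
        ]
    }
}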