作者 | 周萝卜
来源 | 萝卜大杂烩
弹幕接口地址:https://comment.bilibili.com/xxxx.xml(其中 xxxx 为视频的 cid)
def getHTML_content(self):
    """Fetch the video page and return it re-serialized by ``etree``.

    The page at ``self.BVurl`` is requested with ``self.headers``, the
    response body is decoded with the default codec, parsed with
    ``etree.HTML`` and serialized back with ``etree.tostring``.

    Returns:
        The value produced by ``etree.tostring`` for the parsed page.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(self.BVurl, headers=self.headers, timeout=10)
    html_str = response.content.decode()
    html = etree.HTML(html_str)
    return etree.tostring(html)
def get_script_list(self, str):
    """Return the text of every ``<script>`` element found in *str*.

    Args:
        str: HTML markup accepted by ``etree.HTML``. The name shadows the
            builtin; it is kept so keyword callers keep working.

    Returns:
        The result of the XPath query ``//script/text()`` on the parsed
        document.
    """
    html = etree.HTML(str)
    return html.xpath("//script/text()")
script_list = self.get_script_list(html_content)
# Parse the script data to obtain the cid information.
# NOTE(review): the enclosing method is not visible here; the nesting below
# is reconstructed from the statement order - confirm against the original.
find_script_text = None
for script in script_list:
    if '[{"cid":' in script:
        # No early exit: like the original loop, the last match wins.
        find_script_text = script
if find_script_text is None:
    # Fail with a clear message instead of an unbound-name error.
    raise ValueError('no <script> block containing [{"cid": was found')
# Keep what sits between the '[{"cid":' marker and the ',"page":' marker.
final_text = find_script_text.split('[{"cid":')[1].split(',"page":')[0]
# Build the spider for one video (the argument is presumably its BV id -
# confirm against BiliSpider.__init__) and start it.
spider = BiliSpider("BV16p4y187hc")
spider.run()
def get_data(data):
    """Flatten one comment API response into a list of rows.

    Args:
        data: Decoded JSON response; ``data["data"]["replies"]`` holds the
            list of comment objects, or ``None``/empty when there are none.

    Returns:
        One ``[rpid, like, uname, current_level, message]`` list per
        comment, in response order. Empty when there are no replies.
    """
    # The API reports "no replies" as null, so fall back to an empty list.
    replies = data["data"]["replies"] or []
    return [
        [
            reply['rpid'],
            reply['like'],
            reply['member']['uname'],
            reply['member']['level_info']['current_level'],
            reply['content']['message'],
        ]
        for reply in replies
    ]
def save_data(data_type, data):
    """Append comment rows to the CSV file ``<data_type>_data.csv``.

    The header line is written only when the file does not exist yet.
    Fields are emitted through the ``csv`` module, so user names or
    comments containing commas, quotes or line breaks are quoted instead of
    corrupting the column layout.

    Args:
        data_type: Prefix of the output file name (e.g. ``"main"``).
        data: Iterable of rows; the first five items of each row are
            written as rpid, like count, user, level and comment text.
    """
    import csv  # local import: the file's import block is not visible here

    path = data_type + r"_data.csv"
    write_header = not os.path.exists(path)
    # newline="" lets the csv writer control line endings itself.
    with open(path, "a+", encoding='utf-8', newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if write_header:
            writer.writerow(["rpid", "点赞数量", "用户", "等级", "评论内容"])
        writer.writerows(row[:5] for row in data)
# Crawl the top-level comments page by page and, for every top-level
# comment, the first page of its replies. Results are appended to
# main_data.csv and reply_data.csv through save_data().
for i in range(1000):
    url = "https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={}&type=1&oid=972516426&mode=3&plat=1&_=1632192192097".format(str(i))
    print(url)
    d = requests.get(url)
    data = d.json()
    if not data['data']['replies']:
        # An empty page means there are no more top-level comments.
        break
    m_data = get_data(data)
    save_data("main", m_data)
    for j in m_data:
        # j[0] is the rpid of the top-level comment, used as the reply root.
        reply_url = "https://api.bilibili.com/x/v2/reply/reply?jsonp=jsonp&pn=1&type=1&oid=972516426&ps=10&root={}&_=1632192668665".format(str(j[0]))
        print(reply_url)
        r = requests.get(reply_url)
        r_data = r.json()
        # A comment without replies must not end the loop: the remaining
        # comments on this page still need their replies fetched.
        if r_data['data']['replies']:
            reply_data = get_data(r_data)
            save_data("reply", reply_data)
        # Throttle after every request, whether or not it returned replies.
        time.sleep(5)
    time.sleep(5)
# Rows without a user name are dropped for the per-user statistics.
df_new = df.dropna(subset=["用户"], axis=0)
# The 20 most-liked comments, taken from the unfiltered frame.
df1 = df.sort_values("点赞数量", ascending=False).head(20)
# Bar chart of the 20 most-liked comments: comment text on the x axis,
# like count on the y axis, with slider and inside data zoom.
top_comments = df1["评论内容"].to_list()
top_likes = df1["点赞数量"].to_list()
c1 = (
    Bar()
    .add_xaxis(top_comments)
    .add_yaxis("点赞数量", top_likes, color=Faker.rand_color())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="评论热度Top20"),
        datazoom_opts=[
            opts.DataZoomOpts(),
            opts.DataZoomOpts(type_="inside"),
        ],
    )
    .render_notebook()
)
# Number of commenters per user level, highest level first.
pie_data = df_new.等级.value_counts().sort_index(ascending=False)
# Pair every count with its own level taken from the index. Hard-coding
# the labels as 6..2 mislabels or drops slices whenever the levels present
# in the data differ from that range.
level_counts = [
    [str(level), count]
    for level, count in zip(pie_data.index.tolist(), pie_data.tolist())
]
c2 = (
    Pie()
    .add(
        "",
        level_counts,
        radius=["40%", "75%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="等级分布"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c}"))
    .render_notebook()
)
def wordcloud(data, name, pic=None):
    """Render a word cloud of *data* and save it as ``<name>.png``.

    Args:
        data: A string, or an iterable of values (for example a column of
            comments) whose items are converted to text and concatenated.
        name: Output file name without the ``.png`` extension.
        pic: Optional path of an image used as the cloud mask. When omitted
            no mask is applied.
    """
    # str() of a long column yields its truncated display form, so join the
    # individual items instead of stringifying the container.
    if isinstance(data, (str, bytes)):
        text = str(data)
    else:
        try:
            items = iter(data)
        except TypeError:
            text = str(data)
        else:
            text = " ".join(str(item) for item in items)

    # Separate the jieba segments with spaces; joining them with an empty
    # string would simply rebuild the unsegmented text.
    words = " ".join(jieba.cut(text, cut_all=False))

    # Only load a mask image when one was actually supplied.
    img_array = np.array(Image.open(pic)) if pic is not None else None

    wc = WordCloud(width=2000, height=1800, background_color='white', font_path=font, mask=img_array,
                   stopwords=STOPWORDS, contour_width=3, contour_color='steelblue')
    wc.generate(words)
    wc.to_file(name + '.png')
# Word cloud of all comment texts, masked by 1.PNG and written to 冰冰.png.
wordcloud(
    df_new["评论内容"],
    "冰冰",
    "1.PNG",
)
分享
点收藏
点点赞
点在看
文章转发自AI科技大本营微信公众号,版权归其所有。文章内容不代表本站立场和任何投资暗示。
Copyright © 2021.Company 元宇宙YITB.COM All rights reserved.元宇宙YITB.COM