利用python对b站评论进行数据分析

首先,把代码打在下面

import json
import time
import requests
from icecream import ic
import openpyxl as op
import pandas as pd
from stylecloud import gen_stylecloud
import jieba
import math
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

# Prompt for the video's BV id and for the comment count that is used
# below to work out how many result pages to request.
input_bv = input("请输入Bv号:")
input_comment = int(input("请输入评论数"))


def BvToAv(Bv):
    """Convert a Bilibili BV id such as ``"BV17x411w7KC"`` to its AV number.

    The two-character prefix is dropped, each of the next ten characters is
    mapped to a base-58 digit, every digit is multiplied by the power of 58
    assigned to its position, and the sum is reduced by a fixed offset and
    XOR-ed with a fixed mask.

    Args:
        Bv: The BV id including its two-character prefix. Surrounding
            whitespace is ignored. Only the ten characters after the prefix
            are decoded; anything following them is ignored.

    Returns:
        The AV number as an ``int``.

    Raises:
        ValueError: If fewer than ten characters follow the prefix, or if one
            of them is not part of the base-58 alphabet.
    """
    # A character's index in this string is its base-58 digit value.
    alphabet = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    digit_of = {ch: value for value, ch in enumerate(alphabet)}
    # Power of 58 applied to each of the ten characters, in string order.
    exponents = (6, 2, 4, 8, 5, 9, 3, 7, 1, 0)
    offset = 100618342136696320
    xor_mask = 177451812

    # Ids typed at a prompt frequently carry stray whitespace.
    body = Bv.strip()[2:]
    if len(body) < len(exponents):
        raise ValueError(
            f'BV id {Bv!r} is too short: expected a 2-character prefix '
            f'followed by {len(exponents)} characters'
        )

    total = 0
    for ch, exponent in zip(body, exponents):
        if ch not in digit_of:
            raise ValueError(f'BV id {Bv!r} contains an invalid character {ch!r}')
        # Pure integer arithmetic; no float round trip through math.pow.
        total += digit_of[ch] * 58 ** exponent

    return (total - offset) ^ xor_mask


# ---------------------------------------------------------------------------
# Download the comments page by page and store them in bilibili.xlsx.
# ---------------------------------------------------------------------------
def _strip_jsonp(text):
    """Return the JSON payload of *text*, dropping a ``callback(...)`` wrapper if present."""
    text = text.strip()
    if text.startswith(('{', '[')):
        # Already bare JSON - nothing to unwrap.
        return text
    # Keep only what sits between the first "(" and the last ")", instead of
    # relying on the callback name having a fixed length.
    return text[text.index('(') + 1:text.rindex(')')]


# NOTE: the names are kept from the original script - `ws` is the Workbook
# and `wb` is the worksheet the rows are written to.
ws = op.Workbook()
wb = ws.create_sheet(index=0)
for column, title in enumerate(('评论者', '性别', '评论时间', '点赞人数', '评论内容'), start=1):
    wb.cell(row=1, column=column, value=title)
count = 2  # next free worksheet row; row 1 holds the headers

# The script assumes 20 comments per page. Round up so that an exact multiple
# of 20 does not trigger an extra request, and always ask for at least one page.
pages = max(1, (input_comment + 19) // 20)

# These values are identical for every page, so build them once.
av_id = BvToAv(input_bv)
headers = {
    # NOTE(security): hard-coded session cookie (it includes SESSDATA). It
    # should be supplied through configuration rather than stored in source.
    "cookie": "_uuid=2C423E32-D756-42BF-837A-D9EE37444BB551183infoc; buvid3=7A7881EA-BC3F-47AA-A1AB-29037A31339B143081infoc; sid=cp2sahe4; fingerprint=5202fd503f203b18ff5ce39bb5501174; buvid_fp=7A7881EA-BC3F-47AA-A1AB-29037A31339B143081infoc; buvid_fp_plain=7A7881EA-BC3F-47AA-A1AB-29037A31339B143081infoc; DedeUserID=505650288; DedeUserID__ckMd5=f9f2a4a188ea2b77; SESSDATA=cecd7605%2C1643084091%2Ce20d4*71; bili_jct=85a9dfc70bd4895cb2dba29207bdb57f; CURRENT_FNVAL=80; blackside_state=1; rpdid=|(m~YkRYu~u0J'uYk~lJkuum; CURRENT_QUALITY=80; bp_video_offset_505650288=553101546167882251; PVID=6",
    "referer": "https://www.bilibili.com/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
}
# NOTE(review): requests looks proxies up by lower-case URL scheme, so this
# upper-case "HTTPS" key is most likely never matched and the requests go out
# directly. It is left unchanged on purpose: switching it to "https" would
# start routing the traffic (and the cookie above) through this hard-coded
# third-party proxy.
proxies = {"HTTPS": "47.98.183.59:3128"}

for page in range(1, pages + 1):
    if page % 10 == 0:
        # Short pause every tenth page to go easy on the API.
        time.sleep(2)
    print(f'-----------------正在爬取第{page}页数据-----------------')
    time_thick = int(time.time() * 1000)
    url = f'https://api.bilibili.com/x/v2/reply/main?callback=jQuery172009047692616139114_{1627891325400 + page}&jsonp=jsonp&next={page}&type=1&oid={av_id}&mode=3&plat=1&_={time_thick}'

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    except requests.RequestException as exc:
        print(f'第{page}页请求失败: {exc}')
        continue
    if response.status_code != requests.codes.ok:
        print(f'第{page}页请求失败, 状态码: {response.status_code}')
        continue

    try:
        json_data = json.loads(_strip_jsonp(response.text))
    except ValueError as exc:  # also covers json.JSONDecodeError
        print(f'第{page}页响应解析失败: {exc}')
        continue

    # 'data' can be absent or null when the API reports an error, and
    # 'replies' is null once a page has no comments.
    datas = (json_data.get('data') or {}).get('replies')
    ic(datas)
    if not datas:
        continue

    for item in datas:
        # Commenter name; openpyxl rejects certain control characters, so
        # strip them from every free-text field before writing.
        name = ILLEGAL_CHARACTERS_RE.sub(r'', item['member']['uname'])
        # Gender as reported by the API.
        sex = item['member']['sex']
        # Comment time, formatted in the local time zone.
        ctime = item.get('ctime')
        content_time = time.strftime('%Y-%m-%d %H:%M', time.localtime(ctime))
        # Number of likes.
        star = item['like']
        # Comment text.
        cmts = ILLEGAL_CHARACTERS_RE.sub(r'', item['content']['message'])

        for column, value in enumerate((name, sex, content_time, star, cmts), start=1):
            wb.cell(row=count, column=column, value=value)
        count += 1
        ic(name, sex, content_time, star, cmts)

ws.save('bilibili.xlsx')
# Word cloud: read the saved comments back, segment them with jieba and
# render the result with stylecloud.
rcv_data = pd.read_excel('bilibili.xlsx')
# Drop only the rows whose comment text is missing; a blank value in any
# other column must not discard an otherwise usable comment.
exist_col = rcv_data.dropna(subset=['评论内容'])
# astype(str) guards against cells that were read back as numbers.
c_title = exist_col['评论内容'].astype(str).tolist()
# Segment comment by comment so that the end of one comment and the start of
# the next can never be merged into a single token.
wordlist = [word for comment in c_title for word in jieba.cut(comment) if word.strip()]
result = ' '.join(wordlist)
picture = 'bilibili.jpg'
stopwords_list = ['啊', '的', '也', '了', '这', '那', '是', '吗', '都', '就', '你', '我', '他', '她', '他们', '不', '真的', '有', '没有',
                  '还有', '说', '哈', 'doge', '在']
if result:
    # NOTE(review): 'msyh.ttc' is presumably the Microsoft YaHei font shipped
    # with Windows; on other systems a path to a CJK-capable font is probably
    # required - confirm before running elsewhere.
    gen_stylecloud(text=result,
                   icon_name='fas fa-dog',
                   font_path='msyh.ttc',
                   background_color='white',
                   output_name=picture,
                   custom_stopwords=stopwords_list
                   )
    print("succesful!")
else:
    # Nothing to draw: skip rendering instead of failing on empty text.
    print('没有可用的评论内容, 未生成词云图')

下面是对本项目的简单介绍:首先将 BV 号转换为 AV 号,再通过 B 站的 API 获取评论数据,用 openpyxl 写入 Excel 表格,然后用 pandas 读取评论内容并用 jieba 分词,最后用 stylecloud 绘制词云图。

© 版权声明
THE END
喜欢就支持一下吧
点赞7
分享