
A Summary of Python Web Scraping

The crawlers here mainly target resources on Bilibili (b站), since its anti-scraping measures are manageable.

I. Fetching Bilibili images

1. Via img tags in the static page

The first method: use the `requests` module to fetch the HTML source of a Bilibili page, parse the links out of it with a regular expression (matching the `img` tags), then request those links and save the images.

Example: fetching some static images from the Bilibili homepage

import time
import requests
import re
import os
from bs4 import BeautifulSoup


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}


def get_html(url_base):
    response = requests.get(url_base, headers=headers)
    html = response.text
    return html


def get_url(html):
    Img = re.compile(r'img.*src="(.*?)"')
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for item in soup.find_all('img'):
        item = str(item)
        pic_list = re.findall(Img, item)
        for t in pic_list:
            data.append(t)
    return data


def save_pic(data, lim):
    cnt = 0
    for i in data:
        # src links are usually protocol-relative ("//..."); add a scheme and skip non-http links
        if i.startswith('//'):
            i = 'https:' + i
        if not i.startswith('http'):
            continue
        try:
            img = requests.get(i)
            byte = img.content
            with open("image{}.jpg".format(cnt), "wb") as f:
                f.write(byte)
            cnt += 1
            print("downloaded:{}".format(cnt))
        except requests.exceptions.InvalidURL:
            pass
        time.sleep(0.25)
        if cnt >= lim:
            break


def get_image(url_base, target_dir, pic_num):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    html = get_html(url_base)
    data = get_url(html)
    os.chdir(target_dir)
    print("target dir: "+str(os.getcwd()))
    save_pic(data, pic_num)


if __name__ == '__main__':
    url = r'https://www.bilibili.com/'   #input("请输入网址:")
    dirt = r'C:\Users\Yimen\Desktop\bilibili主页_1'   #input("请输入目标文件夹:")
    num = int(input("请输入想要获取的图片数量:"))
    get_image(url, dirt, num)

However, this method only works for some Bilibili pages. On others, such as the Bilibili homepage, the images are almost all rendered by `js` and never appear in the static HTML.

2. Via img tags in the dynamically rendered page

So the second method is to use `selenium` to grab the dynamically rendered source of the current page and then extract the links from that.

This method is guaranteed to find the images: every image you actually see is present in the rendered source, and `selenium` can retrieve all of it. However, `selenium` also only sees images that have already been loaded, so the script has to scroll the page to the bottom automatically, and fetching images from several pages additionally requires handling pagination. It is also slower than `requests`.
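
Before the full example, here is a minimal sketch (not from the original code) of the scrolling idea: keep scrolling and re-checking `document.body.scrollHeight` until it stops growing, so that lazily-loaded images end up in the rendered source. It assumes the same Edge WebDriver as the example below.

# scroll_sketch.py
from selenium import webdriver
import time


def scroll_to_bottom(driver, pause=1.0, max_rounds=30):
    # keep scrolling until the page height stops changing (or max_rounds is reached)
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # give lazily-loaded images time to appear
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # nothing new was loaded
            break
        last_height = new_height


if __name__ == '__main__':
    driver = webdriver.Edge()
    driver.get('https://www.bilibili.com/')
    scroll_to_bottom(driver)
    html = driver.page_source
    driver.quit()
    print(len(html), 'characters of rendered HTML')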

Example: fetching images from the Bilibili homepage

# selenium_get_html.py
from selenium import webdriver
import time


def get_html(url_base):
    driver = webdriver.Edge()
    driver.get(url_base)
    for i in range(0, 10):
        # scroll the page down 1000 pixels at a time so lazily-loaded images appear
        driver.execute_script("window.scrollBy(0,1000)")
        time.sleep(1)
    html = driver.page_source
    driver.quit()
    return html


if __name__ == '__main__':
    # quick standalone test of this helper module
    url = r'https://www.bilibili.com/'
    print(get_html(url)[:500])
# selenium_get_image.py

import time
import requests
import re
import os
from bs4 import BeautifulSoup
import selenium_get_html as sgh


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}


def get_html(url_base):
    response = requests.get(url_base, headers=headers)
    html = response.text
    return html


def get_url(html):
    Img = re.compile(r'img.*src="(.*?)"')
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for item in soup.find_all('img'):
        item = str(item)
        pic_list = re.findall(Img, item)
        for t in pic_list:
            data.append(t)
    return data


def save_pic(data, lim):
    cnt = 0
    for i in data:
        # src links are usually protocol-relative ("//..."); add a scheme and skip non-http links
        if i.startswith('//'):
            i = 'https:' + i
        if not i.startswith('http'):
            continue
        try:
            img = requests.get(i)
            byte = img.content
            with open("image{}.jpg".format(cnt), "wb") as f:
                f.write(byte)
            cnt += 1
            print("downloaded:{}".format(cnt))
        except requests.exceptions.InvalidURL:
            pass
        time.sleep(0.25)
        if cnt >= lim:
            break


def get_image(url_base, target_dir, pic_num):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    html = sgh.get_html(url_base)
    data = get_url(html)
    os.chdir(target_dir)
    print("target dir: "+str(os.getcwd()))
    save_pic(data, pic_num)


if __name__ == '__main__':
    url = r'https://www.bilibili.com/'   #input("请输入网址:")
    dirt = r'C:\Users\Yimen\Desktop\bilibili主页_1'   #input("请输入目标文件夹:")
    num = int(input("请输入想要获取的图片数量:"))
    get_image(url, dirt, num)

3. By analyzing the "Network" data

The third method is to collect the URLs of the data files in bulk from the "Network" tab of the F12 developer console, then parse those files to get the image links, and finally fetch the images.

This requires figuring out which files the site transfers and which of them contain useful data, so it is fairly tedious, but it can fetch a large number of images without touching `selenium`.

Example: fetching the cover images of Bilibili's new anime ranking Top 200

import requests
import json
import os


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

dirt = r'C:\Users\Yimen\Desktop\新番榜Top200'
front_url = r'https://api.bilibili.com/pgc/season/index/result?season_version=-1&spoken_language_type=-1&area=-1' \
            r'&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page='
back_url = r'&season_type=1&pagesize=20&type=1'

if not os.path.exists(dirt):
    os.mkdir(dirt)


def write_json(html, num):
    _dirt = os.path.join(dirt, 'list{}.json'.format(num))
    print('NO.' + str(num) + ' printed')
    with open(_dirt, 'w', encoding='utf-8') as f:
        f.write(html)


def get_json():
    lst = []
    for i in range(6, 11):
        url = front_url + str(i) + back_url
        try:
            response = requests.get(url, headers=headers)
            html = response.content.decode()
            dic = json.loads(html)
            lst.append(dic)
            # write_json(html, i)
        except requests.exceptions.InvalidURL:
            pass
    return lst


def save_pic(lst):
    os.chdir(dirt)
    num = 0
    for dic in lst:
        _lst = dic["data"]["list"]
        for _dic in _lst:
            num += 1
            pic_url = _dic["cover"]
            pic_name = _dic["title"]  # + "_" + _dic["subTitle"]
            response = requests.get(pic_url, headers=headers)
            pic_byte = response.content
            pic_dirt = str(num) + '_' + pic_name + '.jpg'
            print(pic_dirt + ' printed')
            with open(pic_dirt, 'wb') as f:
                f.write(pic_byte)


if __name__ == '__main__':
    save_pic(get_json())
~~One catch with this example: if an anime title contains "unusual" characters the save fails; just leave the title out of the filename and use the number alone.~~

4. By analyzing data embedded in the page source

There is one more way to get the homepage images: pull the `json` string out of the `script` tags in the page source and extract the links from it. The code is omitted here; a rough sketch of the idea is below.
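
The sketch makes two assumptions that are not confirmed by the original post: it guesses that the homepage embeds its data as `window.__INITIAL_STATE__ = {...};` inside a `script` tag (the actual variable name and JSON layout may differ, so check the page source first), and instead of relying on a specific schema it simply walks the parsed JSON for anything that looks like an image URL.

# homepage_json_sketch.py
import json
import re
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}


def extract_embedded_json(html):
    # pull the JSON object assigned to the (assumed) global variable out of the page source
    match = re.search(r'window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*;', html, re.S)
    return json.loads(match.group(1)) if match else {}


def collect_image_urls(node, found):
    # walk the parsed JSON recursively, keeping every string that looks like an image link
    if isinstance(node, dict):
        for value in node.values():
            collect_image_urls(value, found)
    elif isinstance(node, list):
        for value in node:
            collect_image_urls(value, found)
    elif isinstance(node, str) and re.search(r'hdslb\.com/.+\.(?:jpg|png|webp)', node):
        found.append(node if node.startswith('http') else 'https:' + node)


if __name__ == '__main__':
    html = requests.get('https://www.bilibili.com/', headers=headers).text
    urls = []
    collect_image_urls(extract_embedded_json(html), urls)
    print('{} image links found'.format(len(urls)))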

5. Summary

Broadly there are two approaches to scraping: fetch the page source and extract links by parsing its tags, or analyze the data the site sends over the network, find the pattern, and pull out the useful links. Fetching page tags further splits into static and dynamic. If a page is completely static, with no `js` rendering, `requests` alone can read its tags. If the page is rendered dynamically, `selenium` is needed. And when the network-data analysis is done well, even a dynamic page can be scraped by hitting the links directly with `requests`.

II. Fetching Bilibili danmaku

1. Dynamic fetching

Looking at the page source, the danmaku (bullet comments) are also rendered by `js`, so in theory they could be captured with `selenium`. But the danmaku change from moment to moment, so with `selenium` you would have to sit through the entire video to collect them all.

2. Analyzing the data

Analyzing the network traffic shows that the danmaku data is stored in `seg.so` files, and the URLs differ only in an `index` parameter, so they can be fetched in bulk. The `.so` files are binary, however, and hard to decode. ~~I tried for a whole day and got nowhere.~~ In theory this method could retrieve every danmaku, but without a way to parse the data files it is a dead end.

3. A neat trick

~~From other people's write-ups~~ we find an old, long-deprecated danmaku store on Bilibili: `https://comment.bilibili.com/{cid}.xml`, which holds the danmaku for that video part (sometimes complete, sometimes not), and the `cid` sits right in the page source. So this approach works.

import re
import requests
import os
import openpyxl
import openpyxl.styles

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
or_url = r'https://www.bilibili.com/video/{}?spm_id_from=333.337.search-card.all.click'
dm_url = r'https://comment.bilibili.com/{}.xml'

bv_id = r'BV1yG4y1r77N'  # input('输入bv号:')
dirt = r'C:\Users\Yimen\Desktop'


def get_html():
    resp = requests.get(or_url.format(bv_id))
    html = resp.text
    with open('danmaku.html', 'w', encoding='utf-8') as f:
        f.write(html)
    return html


def get_cid(html):
    tmp = re.compile(r'"videoData":.*?"dynamic":.*?,"cid":(.*?),"dimension":')
    lst = re.findall(tmp, html)
    cid = lst[0]
    return cid


def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title


def write_xml(xml):
    with open('danmaku.xml', 'w', encoding='utf-8') as f:
        f.write(xml)


def get_xml(cid):
    resp = requests.get(dm_url.format(cid))
    xml = resp.content.decode()
    #write_xml(xml)
    return xml


def get_time(tm):
    tm = float(tm)
    min = int(tm / 60)
    sec = int(tm - min * 60)
    smin = str(min)
    ssec = str(sec)
    if min < 10:
        smin = '0' + smin
    if sec < 10:
        ssec = '0' + ssec
    word = smin + ':' + ssec
    return word


def get_color(cl):
    color_val = str(hex(cl)).upper()[2:]
    # print(color_val)
    color_len = len(color_val)
    if color_len < 6:
        for i in range(0, 6 - color_len):
            color_val = "0" + color_val
    if color_val == "FFFFFF":
        color_val = "000000"
    return color_val


def get_danmaku(xml, title):
    tmp = re.compile(r'<d p="(.*?),.*?,.*?,(.*?),.*?">(.*?)</d>')
    tmp_lst = re.findall(tmp, xml)
    lst = []
    for i in tmp_lst:
        lst.append([float(i[0]), int(i[1]), i[2]])
    lst.sort(reverse=False)

    os.chdir(dirt)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = '弹幕'
    sheet.append(['出现时间', '弹幕内容'])
    num = 1
    sheet.column_dimensions['A'].width = 15
    sheet.column_dimensions['B'].width = 150
    sheet.row_dimensions[num].height = 25

    for tm, cl, dm in lst:
        sheet.append([get_time(tm), dm])
        num += 1
        sheet.row_dimensions[num].height = 20
        pos = 'B' + str(num)
        cell = sheet[pos]
        color_val = get_color(cl)
        cell.font = openpyxl.styles.Font(name="微软雅黑", size=10, bold=True, italic=False, color="" + color_val + "")

    workbook.save("弹幕_" + title + ".xlsx")


if __name__ == '__main__':
    html = get_html()
    cid = get_cid(html)
    title = get_title(html)
    xml = get_xml(cid)
    get_danmaku(xml, title)
Also, the danmaku fetched this way carry not just the text but also the timestamp, color and other metadata, so an Excel sheet is the natural way to present them.

III. Fetching Bilibili videos

~~Now for the final boss; this is what I learned scraping for in the first place.~~

1. The easy way or the hard way

The quick and painless way is to download videos with `you-get`. ~~So, post finished.~~

2. To learn or not to learn

We learn `python` scraping not to save effort but to understand how it works. ~~If saving effort were the point, paying someone else to do it would be even more comfortable.~~

We find that the `script` tags in the page source already contain the video information; note, though, that a Bilibili video is served as separate video and audio streams.

So we can first extract the `json` string from the `script` tag, then parse it to get the video and audio URLs, and download each of them.

Example: downloading a Bilibili video

import os
import re
import requests as req
import json


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1ua4y1H714?p=2'  # input('输入bv号:')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)


def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)


def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html


def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)


def get_json(html):
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic


def get_times():
    # times.json holds a running counter used to number the output files;
    # create it with init() on first use
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt


def get_video_clip(dic):
    re_url = or_url.format(bv_id)
    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }

    cnt = get_times()
    os.chdir(dirt)

    url = dic['data']['dash']['video'][0]['base_url']
    print("视频链接获取完毕")
    resp = req.get(url, headers=_headers_)
    print("视频获取完毕")
    vid = resp.content
    print("正在保存视频...")
    with open('video_{}.mp4'.format(cnt), 'wb') as f:
        f.write(vid)
    print("保存完毕!")

    url = dic['data']['dash']['audio'][0]['base_url']
    print("音频链接获取完毕")
    resp = req.get(url, headers=_headers_)
    print("音频获取完毕")
    vid = resp.content
    print("正在保存音频...")
    with open('audio_{}.mp3'.format(cnt), 'wb') as f:
        f.write(vid)
    print("保存完毕!")


def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)


if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))

Fetching the audio and video separately is not enough; they still have to be merged, and `ffmpeg` can do that. `moviepy.editor` can also merge audio and video, but it is far slower than `ffmpeg`.

Note that `ffmpeg` is not a system program, so its absolute path has to go into the command line, otherwise the call fails (a small sketch of locating it more robustly follows the list).

  • Installing `ffmpeg`: unzip the `.zip` downloaded from the official site, put the folder anywhere, and add the absolute path of its `bin` folder to the system environment variables.
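
Here is that sketch (not from the original post): it looks `ffmpeg` up on PATH with `shutil.which` and only falls back to a hard-coded absolute path, which is just a placeholder and must be changed to wherever ffmpeg.exe actually lives. Passing the arguments as a list to `subprocess.run` also sidesteps the quoting problems `os.system` has with spaces in filenames.

# find_ffmpeg.py
import shutil
import subprocess

# placeholder fallback location -- adjust to your own ffmpeg install
FALLBACK_FFMPEG = r'C:\ffmpeg\bin\ffmpeg.exe'


def ffmpeg_path():
    # prefer whatever 'ffmpeg' resolves to on PATH, otherwise use the fallback
    return shutil.which('ffmpeg') or FALLBACK_FFMPEG


def merge(audio_name, video_name, output_name):
    # '-c copy' remuxes the streams without re-encoding, so the merge is fast
    subprocess.run([ffmpeg_path(), '-i', audio_name, '-i', video_name,
                    '-c', 'copy', output_name], check=True)

Below are the merge helpers used by the downloader: a moviepy version and an ffmpeg version.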
# merge_video_audio.py
import os
import moviepy.editor as me


def run_by_moviepy(audio_name, video_name, output_name):
    audio = me.AudioFileClip(audio_name)
    video = me.VideoFileClip(video_name)
    output = video.set_audio(audio)
    output.write_videofile(output_name)


def run_by_ffmpeg(audio_name, video_name, output_name):
    cmd = f'C:\\Windows\\ffmpeg-5.1-essentials_build\\bin\\ffmpeg.exe -i {audio_name} -i {video_name} -acodec copy -vcodec copy {output_name}'
    os.system(cmd)
# normal_get_video.py
import os
import re
import requests as req
import json
import merge_video_audio as mva


headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1xa411V7Tq'  # input('输入bv号:')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)


def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)


def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html


def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)


def get_json(html):
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic


def get_times():
    # times.json holds a running counter used to number the output files;
    # create it with init() on first use
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt


def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title


def get_video_clip(dic):

    re_url = or_url.format(bv_id)
    title = get_title(get_html(re_url))

    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }

    cnt = get_times()
    os.chdir(dirt)

    url = dic['data']['dash']['video'][0]['base_url']
    print("视频链接获取完毕,正在下载视频...")
    resp = req.get(url, headers=_headers_)
    print("视频获取完毕")
    vid = resp.content
    print("正在保存视频...")
    video_name = 'video_{}.mp4'.format(cnt)
    with open(video_name, 'wb') as f:
        f.write(vid)
    print("保存完毕!")
    print("--------------------")
    url = dic['data']['dash']['audio'][0]['base_url']
    print("音频链接获取完毕")
    resp = req.get(url, headers=_headers_)
    print("音频获取完毕")
    vid = resp.content
    print("正在保存音频...")
    audio_name = 'audio_{}.mp3'.format(cnt)
    with open(audio_name, 'wb') as f:
        f.write(vid)
    print("保存完毕!")
    print("--------------------")
    print("正在合并音视频...")

    output_name = 'output_{}.mp4'.format(cnt)
    rename = title + '.mp4'
    mva.run_by_ffmpeg(audio_name, video_name, output_name)
    os.rename(output_name, rename)
    print("合成完毕!")
    print("--------------------")
    print("正在清理数据文件...")
    os.remove(audio_name)
    os.remove(video_name)
    print("清理完毕!")


def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)


if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))

Also, since downloading the video from its URL is fairly slow, we can add a progress bar to show the current download progress. When saving the video file we also have to be careful with naming: on a name collision a new name should be generated automatically ("_" + a number), and if the title contains characters that are illegal in a filename, the file simply keeps its temporary name ("output_<count>.mp4").
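
A minimal sketch of those naming rules (not the post's code, which instead wraps os.rename in try/except further down): strip the characters Windows forbids in filenames up front, fall back to a fixed name if nothing usable is left, and append "_<n>" while the name is already taken.

# naming_sketch.py
import os
import re


def safe_video_name(title, fallback='output.mp4'):
    # drop characters that are illegal in Windows filenames
    cleaned = re.sub(r'[\\/:*?"<>|]', '', title).strip()
    if not cleaned:
        return fallback
    name = cleaned + '.mp4'
    num = 0
    while os.path.exists(name):  # avoid overwriting an earlier download
        name = '{}_{}.mp4'.format(cleaned, num)
        num += 1
    return name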

# download_process_bar.py
from contextlib import closing
import requests as req

hea = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}


def download_with_process_bar(url, filename, headers=None):
    if headers is None:
        headers = hea
    with closing(req.get(url, headers=headers, stream=True)) as resp:
        chunk_size = 1024
        content_size = int(resp.headers['content-length'])
        data_count = 0
        with open(filename, 'wb') as f:
            for data in resp.iter_content(chunk_size=chunk_size):
                f.write(data)
                data_count = data_count + len(data)
                # 50-character bar plus a percentage, redrawn on the same line
                done_block = int((data_count / content_size) * 50)
                now_percent = (data_count / content_size) * 100
                print("\r [%s%s] %d%% " % (done_block * '█', ' ' * (50 - 1 - done_block), now_percent), end=" ")
Here `closing` wraps the `req.get` response as a context manager so it can be opened with `with` (newer versions of requests support this directly); `resp.iter_content` streams the body in chunks, which is what makes a progress bar possible; and the `'\r'` keeps the output on one line by moving the cursor back to the start of the line.
# normal_get_video.py
import os
import re
import requests as req
import json
import merge_video_audio as mva
import download_process_bar as dpb
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1254y187SE'  # input('输入bv号:')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)


def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)


def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html


def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)


def get_json(html):
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic


def get_times():
    # times.json holds a running counter used to number the output files;
    # create it with init() on first use
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt


def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title


def get_video_clip(dic):
    re_url = or_url.format(bv_id)
    title = get_title(get_html(re_url))

    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }

    cnt = get_times()
    os.chdir(dirt)

    url = dic['data']['dash']['video'][0]['base_url']
    print("视频链接获取完毕,正在下载视频...")
    video_name = 'video_{}.mp4'.format(cnt)
    dpb.download_with_process_bar(url, video_name, _headers_)
    print("下载完毕!")
    print("--------------------")
    url = dic['data']['dash']['audio'][0]['base_url']
    print("音频链接获取完毕")
    resp = req.get(url, headers=_headers_)
    print("音频获取完毕")
    vid = resp.content
    print("正在保存音频...")
    audio_name = 'audio_{}.mp3'.format(cnt)
    with open(audio_name, 'wb') as f:
        f.write(vid)
    print("保存完毕!")
    print("--------------------")
    print("正在合并音视频...")

    output_name = 'output_{}.mp4'.format(cnt)
    rename = title + '.mp4'
    mva.run_by_ffmpeg(audio_name, video_name, output_name)
    num = 0
    while os.path.exists(rename):
        rename = title + '_' + str(num) + '.mp4'
        num += 1
    try:
        os.rename(output_name, rename)
    except OSError:
        print("文件名不正确,文件将自动命名为 " + output_name + '.')
        pass
    print("合成完毕!")
    print("--------------------")
    print("正在清理数据文件...")
    os.remove(audio_name)
    os.remove(video_name)
    print("清理完毕!")


def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)


if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))

IV. Fetching Bilibili comments

Fetching Bilibili comments works much the same way as fetching videos: grab the `json` (here returned directly by the comment API) and save it. But comments form a tree, so the replies under each top-level comment have to be downloaded as well.

This means hitting a large number of different URLs, and some requests may get blocked. To keep the program from dying we need a few special-case checks, and we can also try not to be identified as a bot by randomly rotating proxy IPs and request headers.

~~In practice that does nothing; after about 200 requests you get blocked regardless.~~

Example: fetching Bilibili comments

# my_headers.py

import random
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
]


proxy_list = [
    '183.95.80.102:8080',
    '123.160.31.71:8080',
    '115.231.128.79:8080',
    '166.111.77.32:80',
    '43.240.138.31:8080',
    '218.201.98.196:3128',
    '47.94.230.42:9999',
    '192.168.131.1:8080'
]


def headers(origin_url):
    num = random.randint(0, 13)
    hea = {
        'user-agent': my_headers[num],
        'referer': origin_url
    }
    return hea


def proxy():
    num = random.randint(0, 7)
    pro = {
        'http': proxy_list[num]
    }
    return pro


def Proxy():
    # build a completely random (and almost certainly dead) ip:8080 proxy;
    # note that only an 'http' proxy is configured, so requests to https:// URLs
    # do not go through it at all
    prox = ''
    for i in range(0, 4):
        num = random.randint(1, 255)
        if i == 3:
            prox = prox + str(num) + ':'
        else:
            prox = prox + str(num) + '.'
    prox = prox + '8080'
    pro = {
        'http': prox
    }
    return pro
# normal_get_review.py
import os
import requests as req
import json
import re
import openpyxl
import openpyxl.styles
import my_headers as mh


or_url = r'https://api.bilibili.com/x/v2/reply/main?csrf=289e176593e8271d3d55f66616' \
         r'bc8ed0&mode=3&next={}&oid=845534966&plat=1&type=1'

bv_url = r'https://www.bilibili.com/video/BV1Q54y157AT?spm_id_from=333.999.0.' \
         r'0&vd_source=291e2d237373b8b9b036cf97aa555083'

more_rev_url = r'https://api.bilibili.com/x/v2/reply/reply?csrf=289e176593e8271d3d55f66616bc8ed0&oid=84553496' \
               r'6&pn={}&ps=10&root={}&type=1'

dirt = r'C:\Users\Yimen\Desktop\bilibili_review'

if not os.path.exists(dirt):
    os.mkdir(dirt)


def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title


def get_html(url):
    h = mh.headers(bv_url)
    p = mh.Proxy()
    # print(p['http'] + h['user-agent'])
    resp = req.get(url, headers=h, proxies=p)
    html = resp.text
    return html


def write_json(json_dic, cnt):
    obj = json.dumps(json_dic, ensure_ascii=False)
    with open('review{}.json'.format(cnt), 'w', encoding='utf-8') as f:
        f.write(obj)


def get_json(html, cnt):
    json_dic = json.loads(html)
    write_json(json_dic, cnt)
    return json_dic


def write_rev(dic, rev_lst, num):
    name = dic["member"]["uname"]
    if num > 0:
        name = str(num) + '_' + name
    content = dic["content"]["message"]
    rev_lst.append([name, content])
    return dic["rpid"]


def write_review():
    os.chdir(dirt)
    rev_lst = []
    title = get_title(get_html(bv_url))

    for i in range(0, 10):
        now_url = or_url.format(i)
        html = get_html(now_url)
        dic = get_json(html, i)
        lst = dic["data"]["replies"]
        user_num = 1

        for new_dic in lst:
            rpid = write_rev(new_dic, rev_lst, -1)
            cnt = 1
            while True:
                try:
                    more_url = more_rev_url.format(cnt, rpid)
                    tmp_html = get_html(more_url)
                    tmp_dic = get_json(tmp_html, i)
                    state = tmp_dic["message"]
                    if state == "请求被拦截":
                        print(state)
                        break
                    else:
                        tmp_lst = tmp_dic["data"]["replies"]
                        if isinstance(tmp_lst, list):
                            for every_dic in tmp_lst:
                                write_rev(every_dic, rev_lst, user_num)
                        else:
                            break
                except req.exceptions.InvalidURL:
                    break
                cnt += 1
            # print(cnt)
            user_num += 1

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = '评论'
    sheet.append(['用户名称', '评论内容'])
    num = 1
    sheet.column_dimensions['A'].width = 30
    sheet.column_dimensions['B'].width = 300
    sheet.row_dimensions[num].height = 25

    for name, content in rev_lst:
        sheet.append([name, content])
        num += 1
        sheet.row_dimensions[num].height = 20

    workbook.save("评论_1_" + title + ".xlsx")


if __name__ == '__main__':
    write_review()

9.4 update

It later turned out that simply putting a one-second pause between requests, time.sleep(1), is enough to avoid getting blocked. Lovely.
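
For reference, the fix amounts to one extra line in `get_html` from normal_get_review.py above (sketched here; `my_headers` is the helper module defined earlier, and the random proxy can simply be dropped):

# rate_limited_get_html.py
import time
import requests as req
import my_headers as mh


def get_html(url, origin_url):
    time.sleep(1)  # space consecutive requests one second apart so the API stops blocking them
    resp = req.get(url, headers=mh.headers(origin_url))
    return resp.text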

V. Scraping Pixiv images

A placeholder; to be filled in over winter break.

VI. A pure-Python VPN

To be filled in over winter break.

Update 2023.1.25: fill it in over winter break, sure; the English gaokao is only 50 days away. Summer break it is.