Python Web Scraping: A Summary
These scrapers mainly target resources on bilibili, since bilibili's anti-scraping measures are decent.
I. Getting images from bilibili
1. Via img tags in the static page
The first approach: use the \(requests\) module to fetch the bilibili page source, parse the links out of it with a regular expression (matching the \(img\) tags), then request each link and save the image.
Example: grabbing some of the static images from the bilibili home page
import time
import requests
import re
import os
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

def get_html(url_base):
    # Fetch the static page source.
    response = requests.get(url_base, headers=headers)
    html = response.text
    return html

def get_url(html):
    # Collect the src attribute of every <img> tag.
    Img = re.compile(r'img.*src="(.*?)"')
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for item in soup.find_all('img'):
        item = str(item)
        pic_list = re.findall(Img, item)
        for t in pic_list:
            data.append(t)
    return data

def save_pic(data, lim):
    cnt = 0
    for i in data:
        if not i.lower().startswith('https:'):
            i = 'https:' + i  # bilibili often uses protocol-relative "//..." links
        try:
            img = requests.get(i)
            byte = img.content
            with open("image{}.jpg".format(cnt), "wb") as f:
                f.write(byte)
            cnt += 1
            print("downloaded:{}".format(cnt))
        except requests.exceptions.RequestException:
            pass
        time.sleep(0.25)
        if cnt >= lim:
            break

def get_image(url_base, target_dir, pic_num):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    html = get_html(url_base)
    data = get_url(html)
    os.chdir(target_dir)
    print("target dir: " + str(os.getcwd()))
    save_pic(data, pic_num)

if __name__ == '__main__':
    url = r'https://www.bilibili.com/'                      # input("Enter a URL: ")
    dirt = r'C:\Users\Yimen\Desktop\bilibili主页_1'          # input("Enter a target folder: ")
    num = int(input("How many images to download? "))
    get_image(url, dirt, num)
However, this only captures part of the images on some bilibili pages; on others, for example the bilibili home page, almost all of the images are rendered by \(js\) and never show up in the static HTML.
2. Via img tags in the dynamically rendered page
So the second approach is to use \(selenium\) to grab the rendered source of the current page and then extract the links from that.
This approach will always find the images: everything you can see is in the rendered source, and \(selenium\) can hand all of it back. However, \(selenium\) also only sees images that have already been loaded, so the script has to scroll the page down to the bottom automatically (a sketch of that follows), and if you want images from several pages you also need to script the pagination. It is also slower than plain \(requests\).
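As a complement to the fixed "scroll ten times by 1000 px" loop used in the example below, here is a minimal sketch of scrolling until the page height stops growing. It assumes the same Edge webdriver as the example; the round cap and pause length are arbitrary choices, not anything bilibili-specific.

# scroll_to_bottom_sketch.py  (hypothetical helper, not part of the original scripts)
from selenium import webdriver
import time

def get_html_scrolled(url, max_rounds=30, pause=1.0):
    driver = webdriver.Edge()
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        # Jump to the current bottom and wait for lazy-loaded content to appear.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:   # nothing new loaded, assume we reached the bottom
            break
        last_height = new_height
    html = driver.page_source
    driver.quit()
    return html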
Example: grabbing images from the bilibili home page
# selenium_get_html.py
from selenium import webdriver
import time

def get_html(url_base):
    driver = webdriver.Edge()
    driver.get(url_base)
    for i in range(0, 10):
        # Scroll the page down by 1000 pixels each time so lazy-loaded images appear.
        driver.execute_script("window.scrollBy(0,1000)")
        time.sleep(1)
    html = driver.page_source
    driver.quit()
    return html

if __name__ == '__main__':
    url = r'https://www.bilibili.com/'
    print(len(get_html(url)))  # quick manual test: report how much HTML came back
# selenium_get_image.py
import time
import requests
import re
import os
from bs4 import BeautifulSoup
import selenium_get_html as sgh

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

def get_html(url_base):
    # Static-page fallback (not used below; kept from the requests-only version).
    response = requests.get(url_base, headers=headers)
    html = response.text
    return html

def get_url(html):
    Img = re.compile(r'img.*src="(.*?)"')
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for item in soup.find_all('img'):
        item = str(item)
        pic_list = re.findall(Img, item)
        for t in pic_list:
            data.append(t)
    return data

def save_pic(data, lim):
    cnt = 0
    for i in data:
        if not i.lower().startswith('https:'):
            i = 'https:' + i
        try:
            img = requests.get(i)
            byte = img.content
            with open("image{}.jpg".format(cnt), "wb") as f:
                f.write(byte)
            cnt += 1
            print("downloaded:{}".format(cnt))
        except requests.exceptions.RequestException:
            pass
        time.sleep(0.25)
        if cnt >= lim:
            break

def get_image(url_base, target_dir, pic_num):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    html = sgh.get_html(url_base)  # rendered DOM from selenium, not the static source
    data = get_url(html)
    os.chdir(target_dir)
    print("target dir: " + str(os.getcwd()))
    save_pic(data, pic_num)

if __name__ == '__main__':
    url = r'https://www.bilibili.com/'                      # input("Enter a URL: ")
    dirt = r'C:\Users\Yimen\Desktop\bilibili主页_1'          # input("Enter a target folder: ")
    num = int(input("How many images to download? "))
    get_image(url, dirt, num)
3. By analysing the "Network" traffic
The third approach is to collect the URLs of the data files in bulk from the "Network" tab of the \(F12\) devtools, then parse those files to get the image links and fetch the images from them.
This requires knowing exactly which files the site transfers and which of them hold the useful data, so it is more tedious, but it can fetch a large number of images without using \(selenium\) at all. (A small exploration helper is sketched below.)
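To make the "which file holds the useful data" step less painful, a small helper that dumps the key structure of a JSON response can help. This is only a sketch: the URL is whatever endpoint you copied from the Network tab (a placeholder here), and the helper just prints the nesting so fields like data → list → cover are easy to spot.

# explore_json_sketch.py  (hypothetical helper for poking at an API response)
import requests
import json

def show_structure(node, prefix="", max_items=2):
    # Recursively print dict keys and list lengths of a parsed JSON document.
    if isinstance(node, dict):
        for key, value in node.items():
            print(prefix + key)
            show_structure(value, prefix + "  ", max_items)
    elif isinstance(node, list):
        print(prefix + "[list of {}]".format(len(node)))
        for item in node[:max_items]:
            show_structure(item, prefix + "  ", max_items)

if __name__ == '__main__':
    url = 'https://api.bilibili.com/...'  # paste the URL copied from the Network tab
    resp = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    show_structure(json.loads(resp.text))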
Example: grabbing the cover images of the bilibili new-anime Top 200
import requests
import json
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
dirt = r'C:\Users\Yimen\Desktop\新番榜Top200'
front_url = r'https://api.bilibili.com/pgc/season/index/result?season_version=-1&spoken_language_type=-1&area=-1' \
            r'&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page='
back_url = r'&season_type=1&pagesize=20&type=1'

if not os.path.exists(dirt):
    os.mkdir(dirt)

def write_json(html, num):
    _dirt = os.path.join(dirt, 'list{}.json'.format(num))
    print('NO.' + str(num) + ' saved')
    with open(_dirt, 'w', encoding='utf-8') as f:
        f.write(html)

def get_json():
    lst = []
    for i in range(6, 11):
        url = front_url + str(i) + back_url
        try:
            response = requests.get(url, headers=headers)
            html = response.content.decode()
            dic = json.loads(html)
            lst.append(dic)
            # write_json(html, i)
        except requests.exceptions.RequestException:
            pass
    return lst

def save_pic(lst):
    os.chdir(dirt)
    num = 0
    for dic in lst:
        _lst = dic["data"]["list"]
        for _dic in _lst:
            num += 1
            pic_url = _dic["cover"]
            pic_name = _dic["title"]  # + "_" + _dic["subTitle"]
            response = requests.get(pic_url, headers=headers)
            pic_byte = response.content
            pic_dirt = str(num) + '_' + pic_name + '.jpg'
            print(pic_dirt + ' saved')
            with open(pic_dirt, 'wb') as f:
                f.write(pic_byte)

if __name__ == '__main__':
    save_pic(get_json())
4. By analysing data embedded in the page source
There is one more way to get the home-page images: take the \(json\) string out of a \(script\) tag in the page source and pull the links from it. The code is omitted here; a rough sketch of the idea follows.
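Since the exact variable name bilibili uses inside the home-page \(script\) tag is not something I want to assume, the sketch below takes a cruder route that still demonstrates the idea: scan the whole page source, embedded script JSON included, for anything that looks like an image URL.

# script_json_sketch.py  (rough sketch; does not assume any particular JSON variable name)
import re
import requests

headers = {'user-agent': 'Mozilla/5.0'}

def image_urls_from_source(url):
    html = requests.get(url, headers=headers).text
    # Links inside embedded JSON are often escaped ("\/" instead of "/"), so unescape first.
    html = html.replace('\\/', '/')
    pattern = re.compile(r'(?:https:)?//[^\s"\']+?\.(?:jpg|jpeg|png|webp|gif)')
    urls = set(pattern.findall(html))
    return ['https:' + u if u.startswith('//') else u for u in urls]

if __name__ == '__main__':
    for link in image_urls_from_source('https://www.bilibili.com/'):
        print(link)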
5. Summary
Broadly, scraping comes down to two approaches: fetch the page source and parse the links out of its tags, or analyse the data the site transfers over the network, find the pattern, and pull the useful links from that. Fetching the page source further splits into static and dynamic. If a page is completely static, with no \(js\) rendering, \(requests\) alone can read its tags. If the page is rendered dynamically, \(selenium\) is needed. And if the traffic analysis is done well, even a dynamic page can be scraped with plain \(requests\) against the right URLs.
II. Getting bilibili danmaku
1. Dynamic scraping
Looking at the page source, the danmaku are also rendered by \(js\), so in theory \(selenium\) could collect them. But the danmaku change from moment to moment, so with \(selenium\) you would have to sit through the entire video to collect them all.
2. Analysing the traffic
Analysing the network traffic shows that the danmaku data lives in the \(seg.so\) responses, and the URLs differ only in their \(index\), so they can be fetched in bulk. The \(.so\) payload is binary, though, and hard to decode. ~~I tried for a whole day and got nowhere.~~ In theory this method can retrieve every danmaku, but without a way to parse the data files it is a dead end. (It looks like protobuf; a hedged decoding sketch follows.)
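For the record, the seg.so payload looks like protobuf; that is an assumption on my part, not something verified in this post. If it is right, a schema-less decoder such as the blackboxprotobuf package can at least dump the field structure. Treat this as an untested sketch: the URL shape and parameter names are roughly what the Network tab appears to show and may need adjusting.

# seg_so_sketch.py  (hypothetical; assumes the payload is protobuf)
import requests
import blackboxprotobuf   # pip install blackboxprotobuf

headers = {'user-agent': 'Mozilla/5.0'}
# oid is assumed to be the video's cid; segment_index is the "index" that changes between requests.
seg_url = 'https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={cid}&segment_index={idx}'

def dump_segment(cid, idx=1):
    raw = requests.get(seg_url.format(cid=cid, idx=idx), headers=headers).content
    message, typedef = blackboxprotobuf.decode_message(raw)   # decode without a .proto schema
    return message

if __name__ == '__main__':
    print(dump_segment('1176840'))   # placeholder cid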
3. A neat trick
~~Courtesy of other people's write-ups~~, it turns out bilibili has an old, deprecated danmaku endpoint: \(https://comment.bilibili.com/{cid}.xml\). It holds the danmaku for that video part (sometimes all of them, sometimes only a subset), and the \(cid\) sits right in the page source. So this approach works.
import re
import requests
import os
import openpyxl
import openpyxl.styles

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
or_url = r'https://www.bilibili.com/video/{}?spm_id_from=333.337.search-card.all.click'
dm_url = r'https://comment.bilibili.com/{}.xml'
bv_id = r'BV1yG4y1r77N'  # input('Enter a BV id: ')
dirt = r'C:\Users\Yimen\Desktop'

def get_html():
    resp = requests.get(or_url.format(bv_id), headers=headers)
    html = resp.text
    with open('danmaku.html', 'w', encoding='utf-8') as f:
        f.write(html)
    return html

def get_cid(html):
    # The cid is embedded in the "videoData" JSON inside the page source.
    tmp = re.compile(r'"videoData":.*?"dynamic":.*?,"cid":(.*?),"dimension":')
    lst = re.findall(tmp, html)
    cid = lst[0]
    return cid

def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title

def write_xml(xml):
    with open('danmaku.xml', 'w', encoding='utf-8') as f:
        f.write(xml)

def get_xml(cid):
    resp = requests.get(dm_url.format(cid), headers=headers)
    xml = resp.content.decode()
    # write_xml(xml)
    return xml

def get_time(tm):
    # Convert a time in seconds into an "mm:ss" string.
    tm = float(tm)
    mins = int(tm / 60)
    secs = int(tm - mins * 60)
    smin = str(mins)
    ssec = str(secs)
    if mins < 10:
        smin = '0' + smin
    if secs < 10:
        ssec = '0' + ssec
    word = smin + ':' + ssec
    return word

def get_color(cl):
    # Turn the decimal colour value into a 6-digit hex string; map white to black
    # so the text stays readable on the white spreadsheet background.
    color_val = str(hex(cl)).upper()[2:]
    color_len = len(color_val)
    if color_len < 6:
        for i in range(0, 6 - color_len):
            color_val = "0" + color_val
    if color_val == "FFFFFF":
        color_val = "000000"
    return color_val

def get_danmaku(xml, title):
    # Each entry looks like <d p="time,mode,fontsize,colour,...">text</d>.
    tmp = re.compile(r'<d p="(.*?),.*?,.*?,(.*?),.*?">(.*?)</d>')
    tmp_lst = re.findall(tmp, xml)
    lst = []
    for i in tmp_lst:
        lst.append([float(i[0]), int(i[1]), i[2]])
    lst.sort(reverse=False)
    os.chdir(dirt)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'danmaku'
    sheet.append(['time', 'danmaku text'])
    num = 1
    sheet.column_dimensions['A'].width = 15
    sheet.column_dimensions['B'].width = 150
    sheet.row_dimensions[num].height = 25
    for tm, cl, dm in lst:
        sheet.append([get_time(tm), dm])
        num += 1
        sheet.row_dimensions[num].height = 20
        pos = 'B' + str(num)
        cell = sheet[pos]
        color_val = get_color(cl)
        cell.font = openpyxl.styles.Font(name="微软雅黑", size=10, bold=True, italic=False, color=color_val)
    workbook.save("danmaku_" + title + ".xlsx")

if __name__ == '__main__':
    html = get_html()
    cid = get_cid(html)
    title = get_title(html)
    xml = get_xml(cid)
    get_danmaku(xml, title)
III. Getting bilibili videos
~~The final boss. This is the whole reason I learned scraping in the first place.~~
1. The easy way or the hard way
One approach that saves both time and effort is to download the video with \(you\)-\(get\). ~~So, end of post.~~
2. To learn or not to learn
We are not learning \(python\) scraping to save effort, but to understand how it works. ~~If saving effort were the point, paying someone else to do it would be even better.~~
It turns out the \(script\) tags in the page source already contain the stream information, except that bilibili serves the video and the audio as separate streams.
So we can first pull the \(json\) string out of the \(script\) tag, then read the video and audio URLs from it and download the two streams separately.
Example: downloading a bilibili video
import os
import re
import requests as req
import json

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1ua4y1H714?p=2'  # input('Enter a BV id: ')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)

def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)

def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html

def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)

def get_json(html):
    # The playback info sits in a <script>window.__playinfo__=...</script> tag.
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic

def get_times():
    # Simple download counter kept in times.json (created by init() on first use).
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt

def get_video_clip(dic):
    re_url = or_url.format(bv_id)
    # The media servers check the Referer header, so it must point back at the video page.
    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }
    cnt = get_times()
    os.chdir(dirt)
    url = dic['data']['dash']['video'][0]['base_url']
    print("Video URL obtained")
    resp = req.get(url, headers=_headers_)
    print("Video downloaded")
    vid = resp.content
    print("Saving video...")
    with open('video_{}.mp4'.format(cnt), 'wb') as f:
        f.write(vid)
    print("Saved!")
    url = dic['data']['dash']['audio'][0]['base_url']
    print("Audio URL obtained")
    resp = req.get(url, headers=_headers_)
    print("Audio downloaded")
    vid = resp.content
    print("Saving audio...")
    with open('audio_{}.mp3'.format(cnt), 'wb') as f:
        f.write(vid)
    print("Saved!")

def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)

if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))
Downloading the audio and the video separately is not enough; they still need to be merged. \(ffmpeg\) can do the merge, and so can \(moviepy.editor\), but the latter is far slower than \(ffmpeg\).
Note that \(ffmpeg\) is not a built-in system program, so here its absolute path is written into the command line; otherwise the call may fail.
- Installing \(ffmpeg\): unzip the \(.zip\) downloaded from the official site, put the folder anywhere you like, and add the absolute path of its \(bin\) folder to the system environment variables. (If it really is on the \(PATH\), the hard-coded path can also be replaced by a lookup; see the sketch after this list.)
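A small sketch of that lookup, using only the standard library; it is an optional alternative to the absolute path hard-coded in the script below.

# find_ffmpeg_sketch.py  (optional alternative to hard-coding the ffmpeg path)
import shutil

def ffmpeg_path():
    # Returns the full path of ffmpeg if it is on PATH, otherwise raises.
    found = shutil.which('ffmpeg')
    if found is None:
        raise FileNotFoundError('ffmpeg not found on PATH; fall back to the absolute path')
    return found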
# merge_video_audio.py
import os
import moviepy.editor as me

def run_by_moviepy(audio_name, video_name, output_name):
    # Much slower than ffmpeg, but pure Python.
    audio = me.AudioFileClip(audio_name)
    video = me.VideoFileClip(video_name)
    output = video.set_audio(audio)
    output.write_videofile(output_name)

def run_by_ffmpeg(audio_name, video_name, output_name):
    # Copy both streams into one container without re-encoding.
    cmd = f'C:\\Windows\\ffmpeg-5.1-essentials_build\\bin\\ffmpeg.exe -i {audio_name} -i {video_name} -acodec copy -vcodec copy {output_name}'
    os.system(cmd)
# normal_get_video.py
import os
import re
import requests as req
import json
import merge_video_audio as mva

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1xa411V7Tq'  # input('Enter a BV id: ')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)

def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)

def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html

def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)

def get_json(html):
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic

def get_times():
    # Download counter kept in times.json (created by init() on first use).
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt

def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title

def get_video_clip(dic):
    re_url = or_url.format(bv_id)
    title = get_title(get_html(re_url))
    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }
    cnt = get_times()
    os.chdir(dirt)
    url = dic['data']['dash']['video'][0]['base_url']
    print("Video URL obtained, downloading video...")
    resp = req.get(url, headers=_headers_)
    print("Video downloaded")
    vid = resp.content
    print("Saving video...")
    video_name = 'video_{}.mp4'.format(cnt)
    with open(video_name, 'wb') as f:
        f.write(vid)
    print("Saved!")
    print("--------------------")
    url = dic['data']['dash']['audio'][0]['base_url']
    print("Audio URL obtained")
    resp = req.get(url, headers=_headers_)
    print("Audio downloaded")
    vid = resp.content
    print("Saving audio...")
    audio_name = 'audio_{}.mp3'.format(cnt)
    with open(audio_name, 'wb') as f:
        f.write(vid)
    print("Saved!")
    print("--------------------")
    print("Merging audio and video...")
    output_name = 'output_{}.mp4'.format(cnt)
    rename = title + '.mp4'
    mva.run_by_ffmpeg(audio_name, video_name, output_name)
    os.rename(output_name, rename)
    print("Merge finished!")
    print("--------------------")
    print("Cleaning up the intermediate files...")
    os.remove(audio_name)
    os.remove(video_name)
    print("Done!")

def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)

if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))
Also, since pulling the actual media data over these URLs is slow, we can add a progress bar to show the current download progress. When saving the video file we also have to be careful about naming: if the name already exists, generate a new one automatically ("_" plus a number); if the name contains illegal characters, fall back to the fixed name ("output.mp4"). (An alternative that sanitises the name up front is sketched below.)
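Instead of falling back to a fixed name when the title contains characters Windows refuses, another option is to strip or replace those characters before saving. A minimal sketch; the character set is the usual Windows-forbidden set, and the replacement character is an arbitrary choice.

# sanitize_name_sketch.py  (optional helper; not used by the scripts below)
import re

def sanitize_filename(name, repl='_'):
    # Replace characters that are not allowed in Windows file names.
    cleaned = re.sub(r'[\\/:*?"<>|]', repl, name).strip()
    return cleaned or 'output'

# e.g. sanitize_filename('Title: part 1?') -> 'Title_ part 1_'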
# download_process_bar.py
from contextlib import closing
import requests as req

hea = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

def download_with_process_bar(url, filename, headers=None):
    if headers is None:
        headers = hea
    # Stream the response so progress can be reported chunk by chunk.
    with closing(req.get(url, headers=headers, stream=True)) as resp:
        chunk_size = 1024
        content_size = int(resp.headers['content-length'])
        data_count = 0
        with open(filename, 'wb') as f:
            for data in resp.iter_content(chunk_size=chunk_size):
                f.write(data)
                data_count = data_count + len(data)
                done_block = int((data_count / content_size) * 50)
                now_percent = (data_count / content_size) * 100
                print("\r [%s%s] %d%% " % (done_block * '█', ' ' * (50 - done_block), now_percent), end=" ")
# normal_get_video.py
import os
import re
import requests as req
import json
import merge_video_audio as mva
import download_process_bar as dpb

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                  '.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
or_url = r'https://www.bilibili.com/video/{}'
ep_url = r'https://www.bilibili.com/bangumi/play/{}?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0'
bv_id = r'BV1254y187SE'  # input('Enter a BV id: ')
ep_id = r'ep517946'
dirt = r'C:\Users\Yimen\Desktop\bilibili_video'

if not os.path.exists(dirt):
    os.mkdir(dirt)

def write_html(html):
    with open('video.html', 'w', encoding='utf-8') as f:
        f.write(html)

def get_html(url):
    resp = req.get(url, headers=headers)
    html = resp.content.decode()
    # write_html(html)
    return html

def write_json(json_dic):
    with open('video.json', 'w', encoding='utf-8') as f:
        json.dump(json_dic, f)

def get_json(html):
    tmp = re.compile(r'<script>window.__playinfo__=(.*?)</script>')
    json_text = re.findall(tmp, html)[0]
    json_dic = json.loads(json_text)
    # write_json(json_dic)
    return json_dic

def get_times():
    # Download counter kept in times.json (created by init() on first use).
    if not os.path.exists('times.json'):
        init()
    with open('times.json', 'r') as f:
        dic = json.load(f)
    cnt = dic["times"]
    dic["times"] += 1
    with open('times.json', 'w') as f:
        json.dump(dic, f)
    return cnt

def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title

def get_video_clip(dic):
    re_url = or_url.format(bv_id)
    title = get_title(get_html(re_url))
    _headers_ = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0'
                      '.5112.102 Safari/537.36 Edg/104.0.1293.63',
        'Referer': re_url
    }
    cnt = get_times()
    os.chdir(dirt)
    url = dic['data']['dash']['video'][0]['base_url']
    print("Video URL obtained, downloading video...")
    video_name = 'video_{}.mp4'.format(cnt)
    dpb.download_with_process_bar(url, video_name, _headers_)
    print("Download finished!")
    print("--------------------")
    url = dic['data']['dash']['audio'][0]['base_url']
    print("Audio URL obtained")
    resp = req.get(url, headers=_headers_)
    print("Audio downloaded")
    vid = resp.content
    print("Saving audio...")
    audio_name = 'audio_{}.mp3'.format(cnt)
    with open(audio_name, 'wb') as f:
        f.write(vid)
    print("Saved!")
    print("--------------------")
    print("Merging audio and video...")
    output_name = 'output_{}.mp4'.format(cnt)
    rename = title + '.mp4'
    mva.run_by_ffmpeg(audio_name, video_name, output_name)
    num = 0
    while os.path.exists(rename):
        # Avoid clobbering an existing file: append _0, _1, ... to the title.
        rename = title + '_' + str(num) + '.mp4'
        num += 1
    try:
        os.rename(output_name, rename)
    except OSError:
        # The title contains characters Windows does not allow in file names.
        print("Invalid file name; the file keeps the name " + output_name + '.')
        pass
    print("Merge finished!")
    print("--------------------")
    print("Cleaning up the intermediate files...")
    os.remove(audio_name)
    os.remove(video_name)
    print("Done!")

def init():
    with open('times.json', 'w') as f:
        json.dump({"times": 0}, f)

if __name__ == '__main__':
    get_video_clip(get_json(get_html(or_url.format(bv_id))))
IV. Getting bilibili comments
Getting comments works much the same way as getting videos: pull out the \(json\) and save it (here the \(json\) comes straight from the reply API rather than a \(script\) tag). Comments form a tree, though, so the sub-replies under every comment have to be downloaded as well.
This means hitting a lot of different URLs, so requests may get blocked. To keep the program from dying, a few special-case checks are needed, and we can also try tricks like randomly rotating the proxy \(IP\) and the request headers so the traffic is not flagged as a bot.
~~In practice that does nothing: after about 200 requests you get blocked anyway.~~
Example: grabbing bilibili comments
# my_headers.py
import random

# A pool of user-agent strings to rotate through.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
]

proxy_list = [
    '183.95.80.102:8080',
    '123.160.31.71:8080',
    '115.231.128.79:8080',
    '166.111.77.32:80',
    '43.240.138.31:8080',
    '218.201.98.196:3128',
    '47.94.230.42:9999',
    '192.168.131.1:8080'
]

def headers(origin_url):
    # Random user-agent plus a referer pointing back at the video page.
    num = random.randint(0, 13)
    hea = {
        'user-agent': my_headers[num],
        'referer': origin_url
    }
    return hea

def proxy():
    # Pick one proxy from the fixed list.
    num = random.randint(0, 7)
    pro = {
        'http': proxy_list[num]
    }
    return pro

def Proxy():
    # Fabricate a random "a.b.c.d:8080" proxy address. Note that a proxies dict with
    # only an 'http' key is ignored by requests for https:// URLs, so against
    # bilibili's https endpoints this mostly has no real effect.
    prox = ''
    for i in range(0, 4):
        num = random.randint(1, 255)
        if i == 3:
            prox = prox + str(num) + ':'
        else:
            prox = prox + str(num) + '.'
    prox = prox + '8080'
    pro = {
        'http': prox
    }
    return pro
# normal_get_review.py
import os
import requests as req
import json
import re
import openpyxl
import openpyxl.styles
import my_headers as mh

# Top-level comments for this oid; {} is the page index.
or_url = r'https://api.bilibili.com/x/v2/reply/main?csrf=289e176593e8271d3d55f66616' \
         r'bc8ed0&mode=3&next={}&oid=845534966&plat=1&type=1'
bv_url = r'https://www.bilibili.com/video/BV1Q54y157AT?spm_id_from=333.999.0.' \
         r'0&vd_source=291e2d237373b8b9b036cf97aa555083'
# Sub-replies under one top-level comment; {} are the page number and the root rpid.
more_rev_url = r'https://api.bilibili.com/x/v2/reply/reply?csrf=289e176593e8271d3d55f66616bc8ed0&oid=84553496' \
               r'6&pn={}&ps=10&root={}&type=1'
dirt = r'C:\Users\Yimen\Desktop\bilibili_review'

if not os.path.exists(dirt):
    os.mkdir(dirt)

def get_title(html):
    tmp = re.compile(r'"videoData":.*?"pic":.*?,"title":"(.*?)","pubdate":')
    lst = re.findall(tmp, html)
    title = lst[0]
    return title

def get_html(url):
    h = mh.headers(bv_url)
    p = mh.Proxy()
    # print(p['http'] + h['user-agent'])
    resp = req.get(url, headers=h, proxies=p)
    html = resp.text
    return html

def write_json(json_dic, cnt):
    obj = json.dumps(json_dic, ensure_ascii=False)
    with open('review{}.json'.format(cnt), 'w', encoding='utf-8') as f:
        f.write(obj)

def get_json(html, cnt):
    json_dic = json.loads(html)
    write_json(json_dic, cnt)
    return json_dic

def write_rev(dic, rev_lst, num):
    # Append one (user, comment) row; num > 0 marks a sub-reply under top-level comment num.
    name = dic["member"]["uname"]
    if num > 0:
        name = str(num) + '_' + name
    content = dic["content"]["message"]
    rev_lst.append([name, content])
    return dic["rpid"]

def write_review():
    os.chdir(dirt)
    rev_lst = []
    title = get_title(get_html(bv_url))
    for i in range(0, 10):
        now_url = or_url.format(i)
        html = get_html(now_url)
        dic = get_json(html, i)
        lst = dic["data"]["replies"]
        user_num = 1
        for new_dic in lst:
            rpid = write_rev(new_dic, rev_lst, -1)
            cnt = 1
            while True:
                try:
                    more_url = more_rev_url.format(cnt, rpid)
                    tmp_html = get_html(more_url)
                    tmp_dic = get_json(tmp_html, i)
                    state = tmp_dic["message"]
                    if state == "请求被拦截":
                        # Literal message the API returns when the request gets blocked.
                        print(state)
                        break
                    else:
                        tmp_lst = tmp_dic["data"]["replies"]
                        if type(tmp_lst) == type([]):
                            for every_dic in tmp_lst:
                                write_rev(every_dic, rev_lst, user_num)
                        else:
                            # No more sub-replies under this comment.
                            break
                except req.exceptions.RequestException:
                    break
                cnt += 1
                # print(cnt)
            user_num += 1
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'comments'
    sheet.append(['user', 'comment'])
    num = 1
    sheet.column_dimensions['A'].width = 30
    sheet.column_dimensions['B'].width = 300
    sheet.row_dimensions[num].height = 25
    for name, content in rev_lst:
        sheet.append([name, content])
        num += 1
        sheet.row_dimensions[num].height = 20
    workbook.save("comments_1_" + title + ".xlsx")

if __name__ == '__main__':
    write_review()
Update 9.4
It later turned out that simply putting a one-second pause, time.sleep(1), between requests is enough to avoid getting blocked. Nice.
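Since a fixed one-second pause is what actually kept the requests from being blocked, the rotating headers and proxies above can be replaced by something much simpler. A sketch of a "polite" GET with a pause and a couple of retries; the pause length, timeout, and retry count are arbitrary choices.

# polite_get_sketch.py  (hypothetical helper, not part of the original scripts)
import time
import requests

def polite_get(url, headers=None, pause=1.0, retries=3):
    # Wait before every request and retry a few times on network errors.
    for attempt in range(retries):
        time.sleep(pause)
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            pass
    return None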
V. Scraping pixiv images
A placeholder; to be filled in over winter break.
VI. A pure-Python VPN
To be filled in over winter break.
Update 2023.1.25: the winter-break plan is not happening; the English gaokao is only 50 days away. Summer break, then.