环境使用:

Python 3.8
Pycharm 2021.2版本
ffmpeg

模块使用:

import requests >>> pip install requests

内置模块无需另外安装, 安装好 Python 环境即可直接使用

import re
import json
import subprocess

思路分析

一. 数据来源分析

确定自己的需求 <采集的网站是哪个, 要获取的数据是什么>
通过开发者工具进行抓包分析, 分析我们想要数据内容来自于哪里 <通过网页源代码就可以找到相应数据内容>
视频信息数据在网页源代码的 playinfo 里面
标题

我们想要数据内容都是来自于网页源代码

I.用浏览器打开这个网址
II. 在这个网页上面用鼠标右键点击查看网页源代码会弹出一个新的窗口
III. ctrl + F 打开搜索框搜索playinfo 可以找到相关信息数据
IV. ctrl + F 打开搜索框搜索标题, 也可以找到相关的数据内容

二. 代码实现的四个基本步骤

发送请求, 模拟浏览器对于url网址发送请求 <专门定义函数来发送请求>
获取数据, 获取网页源代码
解析数据, 提取我们想要数据内容 <信息以及标题>
保存数据, 把内容保存本地

部分代码

import requests  # 数据请求模块 <发送请求工具>
import re  # 正则表达式
import json  # 序列化与反序列
import pprint  # 格式化输出模块
import subprocess
import os


def get_response(html_url, data=None, timeout=30):
    """Send a GET request with browser-like headers and return the response.

    :param html_url: URL to request.
    :param data: optional mapping sent as the query string (``params``).
    :param timeout: seconds to wait on the server before giving up; passed
        straight to ``requests.get``. Without a timeout a stalled connection
        would block the caller indefinitely.
    :return: the ``requests.Response`` object; the HTTP status is not checked.
    """
    headers = {
        # Anti-hotlinking header: tells the server which page the request
        # was navigated from.
        # NOTE(review): the value below is placeholder text rather than a URL;
        # non-Latin-1 header values may be rejected when the request is sent --
        # confirm and replace with the real referring page.
        'referer': '解答、完整源码、教程加Q裙：832157862',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    response = requests.get(url=html_url, params=data, headers=headers, timeout=timeout)
    return response


def get_video_info(play_url):
    """Fetch a video page and extract its title and media stream URLs.

    :param play_url: URL of the video page to fetch.
    :return: ``[title, audio_url, video_url]`` where ``title`` has spaces and
        file-name-unsafe characters removed, and the URLs are the ``baseUrl``
        of the first audio and first video entry in the embedded playinfo.
    :raises IndexError: if the title or playinfo pattern is not found in the
        page source.
    :raises KeyError: if the playinfo JSON lacks the expected
        ``data -> dash -> audio/video`` structure.
    """
    page_source = get_response(html_url=play_url).text

    # The title sits between the "title" and "pubdate" keys in the page source.
    title = re.findall('"title":"(.*?)","pubdate"', page_source)[0].replace(' ', '')
    # Drop every character that is illegal in a Windows file name. The
    # backslash must be escaped explicitly: the previous class `[/\:...]`
    # only escaped the colon, so backslashes were left in the title.
    title = re.sub(r'[\\/:*?"<>|]', '', title)

    # The stream information is a JSON object assigned to window.__playinfo__
    # inside a <script> tag; parse it so it can be indexed like a dict.
    html_data = re.findall('<script>window.__playinfo__=(.*?)</script>', page_source)[0]
    json_data = json.loads(html_data)

    dash = json_data['data']['dash']
    audio_url = dash['audio'][0]['baseUrl']
    video_url = dash['video'][0]['baseUrl']
    print(audio_url)
    print(video_url)
    video_info = [title, audio_url, video_url]
    return video_info


def save(title, audio_url, video_url):
    """Download the audio and video streams and merge them with ffmpeg.

    Both streams are written into the ``video`` directory (created if
    missing) as ``<title>.mp3`` and ``<title>.mp4``, then merged into
    ``<title>output.mp4``. The two intermediate files are deleted only when
    ffmpeg reports success, so a failed merge never destroys the downloads.

    :param title: base file name for the downloaded and merged files.
    :param audio_url: URL of the audio stream.
    :param video_url: URL of the video stream.
    :return: None
    """
    audio_content = get_response(html_url=audio_url).content
    video_content = get_response(html_url=video_url).content

    # os.path.join keeps the original `video\<name>` layout on Windows while
    # also producing a real directory path on other platforms.
    out_dir = 'video'
    os.makedirs(out_dir, exist_ok=True)
    audio_path = os.path.join(out_dir, title + '.mp3')
    video_path = os.path.join(out_dir, title + '.mp4')
    output_path = os.path.join(out_dir, title + 'output.mp4')

    with open(audio_path, mode='wb') as f:
        f.write(audio_content)
    with open(video_path, mode='wb') as f:
        f.write(video_content)

    # Pass the arguments as a list (no shell): the title comes from a web
    # page, so interpolating it into a shell string would let characters such
    # as `&` or `;` be interpreted as shell syntax.
    cmd = [
        'ffmpeg',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-strict', 'experimental',
        output_path,
    ]
    try:
        merged = subprocess.run(cmd).returncode == 0
    except OSError:
        # ffmpeg is missing or cannot be executed; treat it as a failed merge
        # rather than crashing, as the shell-based version did.
        merged = False

    if not merged:
        print(title, 'ffmpeg merge failed; keeping the separate audio/video files')
        return

    os.remove(video_path)
    os.remove(audio_path)
    print('')
    print(title, '视频下载完成')

def get_search(page, word):
    """Query the search endpoint and return the ``bvid`` of every result.

    :param page: value sent as the ``page`` query parameter.
    :param word: value sent as the ``keyword`` query parameter.
    :return: list of the ``bvid`` values found under ``data -> result`` in the
        JSON response; the list is also printed.
    """
    # NOTE(review): this literal is placeholder text, not a URL -- the real
    # search endpoint has to be filled in before this function can work.
    search_url = '解答、完整源码、教程加Q裙：832157862'
    query = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page,
        'page_size': '42',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': word,
        'category_id': '',
        'search_type': 'video',
        'dynamic_offset': '84',
        'preload': 'true',
        'com2co': 'true',
    }
    response = get_response(html_url=search_url, data=query)
    results = response.json()['data']['result']
    bv_list = []
    for entry in results:
        bv_list.append(entry['bvid'])
    print(bv_list)
    return bv_list


def get_up_video(page, up_id):
    """Collect the ``bvid`` of the videos listed for one uploader.

    :param page: value sent as the ``pn`` query parameter
        (presumably the page number -- confirm against the API).
    :param up_id: uploader ID, sent as the ``mid`` query parameter.
    :return: list of the ``bvid`` values found under
        ``data -> list -> vlist`` in the JSON response; the list is also
        printed.
    """
    # NOTE(review): this literal is placeholder text, not a URL -- the real
    # uploader-video endpoint has to be filled in before this can work.
    up_link = '解答、完整源码、教程加Q裙：832157862'
    query = {
        'mid': up_id,
        'ps': '30',
        'tid': '0',
        'pn': page,
        'keyword': '',
        'order': 'pubdate',
        'jsonp': 'jsonp',
    }
    response = get_response(html_url=up_link, data=query)
    videos = response.json()['data']['list']['vlist']
    bv_list = []
    for entry in videos:
        bv_list.append(entry['bvid'])
    print(bv_list)
    return bv_list
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103