【Python采集】把网站排行榜视频内容通通采集 - Python

前言

大家早好、午好、晚好吖~

需要的开发环境以及模块:

python 3.6
pycharm
requests
re
os

import os
import pprint
import re  # built-in module, no installation needed

import requests  # third-party module, install it with: pip install requests

def change_title(title):
    """Return *title* with file-name-unsafe characters replaced by ``_``.

    Every occurrence of one of ``\\ / : * ? < > | "`` is replaced by a
    single underscore; all other characters are left untouched. The set
    matches the characters that are reserved in Windows file names, which
    is presumably why it was chosen, since the result is used as a file name.
    """
    return re.sub(r'[\\\/\:\*\?\<\>\|\"]', '_', title)



def get_video_url(video_id):
    """Resolve the downloadable MP4 address for one video.

    The site's status endpoint is queried and ``videoInfo.videos.srcUrl``
    is read from its JSON reply. According to the original author's notes
    that address is a decoy: the working address has the same directory
    and the same file-name tail, but the leading token of the file name is
    replaced by ``cont-<video_id>``. This function performs that rewrite.

    NOTE(review): the host name is blank in every URL of this listing
    (``www..com``); it has to be filled in before the script can run.

    Args:
        video_id: Id of the video (str or int) as captured from the
            ``video_<id>`` links of the ranking page.

    Returns:
        The rewritten video URL as a ``str``.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the reply body is not valid JSON.
        KeyError: if the reply lacks ``videoInfo``/``videos``/``srcUrl``.
    """
    html_url = f'https://www..com/videoStatus.jsp?contId={video_id}&mrd=0.179849252514223'
    headers_1 = {
        # The Referer must be a well-formed URL; the previous value had
        # spaces around every separator ("https: // www..com / video_...").
        'Referer': f'https://www..com/video_{video_id}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # requests never times out by default; bound the wait so one stalled
    # connection cannot hang the whole crawl.
    response_1 = requests.get(url=html_url, headers=headers_1, timeout=10)
    src_url = response_1.json()['videoInfo']['videos']['srcUrl']

    # Tail of the decoy URL: everything after its first '-'.
    # e.g. ".../20210307/1615275932043-15624807_adpkg-ad_hd.mp4"
    #   -> "15624807_adpkg-ad_hd.mp4"
    tail = src_url.partition('-')[2]
    # Directory of the decoy URL: its first six '/'-separated pieces,
    # i.e. scheme, host and (presumably) three path segments.
    base = '/'.join(src_url.split('/')[:6])

    # Working address: <base>/cont-<video_id>-<tail>
    video_url = base + '/' + 'cont-' + str(video_id) + '-' + tail
    return video_url

# Crawl the popularity ranking page by page and save every listed video as
# video/<sanitised title>.mp4.
# NOTE(review): the host name is blank in the URL below (``www..com``); it
# has to be filled in before the script can run.

# Browser-like User-Agent for the listing requests. It is loop-invariant,
# so it is built once here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}

# open() does not create missing directories, so make sure the output
# directory exists before the first download is written.
os.makedirs('video', exist_ok=True)

# The ``start`` query parameter advances in steps of 10 (0, 10, ..., 100);
# ``num`` is the 1-based page counter used only for the progress message.
for num, page in enumerate(range(0, 101, 10), start=1):
    print(f'=========================正在爬取第{num}页的视频内容====================')
    url = f'https://www..com/popular_loading.jsp?reqType=1&categoryId=&start={page}&sort=10&mrd=0.5595334619073158'

    # Pass the headers that were previously built but never sent, and bound
    # the wait so a stalled connection cannot hang the crawl.
    response = requests.get(url=url, headers=headers, timeout=10)

    # Extract the video ids and their titles from the returned HTML fragment.
    # Raw strings keep ``\d`` from being treated as an (invalid) string escape.
    videos = re.findall(r'<a href="video_(\d+)" class="popularembd actplay">', response.text)
    names = re.findall(r'<h2 class="popularem-title">(.*?)</h2>', response.text)

    # Ids and titles are paired positionally; zip() stops at the shorter list.
    for video_id, name in zip(videos, names):
        video_url = get_video_url(video_id)
        new_title = change_title(name)
        video_content = requests.get(url=video_url, timeout=10).content
        # os.path.join picks the right separator on every platform, instead
        # of the Windows-only 'video\\' prefix.
        with open(os.path.join('video', new_title + '.mp4'), mode='wb') as f:
            f.write(video_content)
        print('正在保存: ', name)
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49