爬取某站点运行截图如下:
这里使用了阿布云的产品切换代理。
这个产品的文档还是很全的!
推荐大伙使用:
关键代码如下:
注意这个代理可能会出现异常,记得做好异常处理。
源码如下:
- import re
- import requests, time
-
class HandleLaGou(object):
    """Crawl Python job postings from lagou.com through an Abuyun proxy.

    Typical use: call ``handle_city()`` to fill ``city_list``, then call
    ``handle_city_job(city)`` for each city of interest. Every HTTP call goes
    through ``handle_request``, which retries until it gets a usable answer.
    """

    # Page listing every city the site has postings for.
    CITY_URL = "https://www.lagou.com/jobs/allCity.html"
    # Search-result page for one city. It is used three ways: to read the
    # page count, as the Referer of the Ajax calls, and to obtain fresh
    # cookies after the session has been reset.
    LIST_URL = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="
    # Ajax endpoint that returns one page of postings for one city.
    AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"

    # NOTE(review): proxy credentials are hard-coded in source; consider
    # loading them from the environment or a config file instead.
    PROXY_URL = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": "http-dyn.abuyun.com",
        "port": 9020,
        "user": "V21C9SWA4CQ3FSHD",
        "pass": "1DF3191F6103Q34",
    }
    PROXIES = {"http": PROXY_URL, "https": PROXY_URL}

    REQUEST_TIMEOUT = 6   # seconds allowed for each HTTP call
    RETRY_DELAY = 10      # seconds to wait before retrying a failed call

    CITY_PATTERN = re.compile(r'zhaopin/">(.*?)</a>')
    TOTAL_PAGE_PATTERN = re.compile(r'class="span\stotalNum">(\d+)</span>')

    def __init__(self):
        self.laGou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        # Becomes a list of city names once handle_city() has run.
        self.city_list = ""

    def handle_city(self):
        """Fetch the nationwide city list and store it in ``self.city_list``."""
        city_result = self.handle_request(method="GET", url=self.CITY_URL)
        self.city_list = self.CITY_PATTERN.findall(city_result)
        # Drop the cookies from this request so each city starts clean.
        self.laGou_session.cookies.clear()

    def handle_city_job(self, city):
        """Request and print every result page of Python jobs for ``city``.

        Returns silently when the search page carries no total-page marker.
        """
        list_url = self.LIST_URL % city
        first_response = self.handle_request(method="GET", url=list_url)
        found = self.TOTAL_PAGE_PATTERN.search(first_response)
        if found is None:
            return

        page_url = self.AJAX_URL % city
        # The header value is passed as UTF-8 bytes because the city name is
        # usually non-ASCII.
        self.header['Referer'] = list_url.encode()
        for page in range(1, int(found.group(1)) + 1):
            data = {
                "pn": page,
                "kd": "python",
            }
            response = self.handle_request(method="POST", url=page_url, data=data, info=city)
            print(response)

    def handle_request(self, method, url, data=None, info=None):
        """Send one request through the proxy, retrying until it succeeds.

        Args:
            method: ``"GET"`` or ``"POST"``.
            url: Target URL.
            data: Form payload, used for POST only.
            info: City name used to re-acquire cookies before a retry; when
                ``None`` the cookies are only cleared.

        Returns:
            The response body as text. For POST the body is decoded as UTF-8
            and the call is retried while it contains the rate-limit marker.

        Raises:
            ValueError: if ``method`` is neither ``"GET"`` nor ``"POST"``.
        """
        if method not in ("GET", "POST"):
            raise ValueError("unsupported HTTP method: %r" % (method,))

        while True:
            try:
                if method == "GET":
                    response = self.laGou_session.get(
                        url=url, headers=self.header,
                        proxies=dict(self.PROXIES), timeout=self.REQUEST_TIMEOUT)
                    return response.text
                response = self.laGou_session.post(
                    url=url, headers=self.header, data=data,
                    proxies=dict(self.PROXIES), timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                # Proxy or network failure: reset the session, wait, retry.
                # Only request errors are caught so Ctrl-C still stops the loop.
                self._refresh_cookies(info)
                time.sleep(self.RETRY_DELAY)
                continue

            response.encoding = 'utf-8'
            text = response.text
            if '频繁' in text:
                # Rate-limited: clear the cookies, fetch new ones, then retry.
                self._refresh_cookies(info)
                time.sleep(self.RETRY_DELAY)
                continue

            return text

    def _refresh_cookies(self, info):
        """Clear the session cookies and try once to obtain new ones for ``info``."""
        self.laGou_session.cookies.clear()
        if info is None:
            # No city to build the search-page URL from.
            return
        try:
            self.laGou_session.get(
                url=self.LIST_URL % info, headers=self.header,
                proxies=dict(self.PROXIES), timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Best effort only: the caller's retry loop will come back here.
            pass
-
-
if __name__ == '__main__':
    # Build the crawler and load the list of cities first.
    laGou = HandleLaGou()
    laGou.handle_city()

    # Only the first city in the list is crawled.
    for city in laGou.city_list[:1]:
        laGou.handle_city_job(city)