关键词搜索

源码搜索 ×
×

Python笔记-使用代理切换ip爬取数据

发布2020-03-13浏览5891次

详情内容

爬取某站点运行截图如下:

这里使用了阿布云的产品切换代理。

这个产品的文档还是很齐全的!

推荐大伙使用:

关键代码如下:

注意:这个代理可能出现异常,记得做好异常处理。

源码如下:

  1. import re
  2. import requests, time
  class HandleLaGou(object):
      """Fetch Python job listings from lagou.com through an Abuyun HTTP proxy.

      One ``requests`` session is shared by all calls so cookies obtained from
      the listing page are sent with the follow-up Ajax POST requests.
      """

      # NOTE(review): the proxy credentials are hard-coded exactly as in the
      # original listing; move them to configuration before real use.
      PROXY_HOST = "http-dyn.abuyun.com"
      PROXY_PORT = 9020
      PROXY_USER = "V21C9SWA4CQ3FSHD"
      PROXY_PASS = "1DF3191F6103Q34"

      REQUEST_TIMEOUT = 6  # seconds, per request
      RETRY_DELAY = 10     # seconds to wait before retrying a failed request

      _CITY_URL = "https://www.lagou.com/jobs/allCity.html"
      _LIST_URL = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="
      _AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"

      _CITY_PATTERN = re.compile(r'zhaopin/">(.*?)</a>')
      _TOTAL_PAGE_PATTERN = re.compile(r'class="span\stotalNum">(\d+)</span>')

      def __init__(self):
          self.laGou_session = requests.session()
          self.header = {
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
          }
          # Filled in by handle_city(); starts empty so iterating it is safe.
          self.city_list = []

      def handle_city(self):
          """Download the all-cities page and store the city names in ``city_list``."""
          city_result = self.handle_request(method="GET", url=self._CITY_URL)
          self.city_list = self._CITY_PATTERN.findall(city_result)
          # Drop the cookies picked up here so each city starts a fresh session.
          self.laGou_session.cookies.clear()

      def handle_city_job(self, city):
          """Print every result page of the "python" search for ``city``.

          Returns ``None`` without doing anything further when the listing
          page does not contain a total page count.
          """
          list_url = self._LIST_URL % city
          first_response = self.handle_request(method="GET", url=list_url)

          page_count = self._TOTAL_PAGE_PATTERN.search(first_response)
          if page_count is None:
              return

          page_url = self._AJAX_URL % city
          # The Ajax endpoint is called with the listing page as Referer. The
          # value is sent as bytes, presumably so that non-ASCII city names
          # survive header encoding -- kept as in the original.
          self.header['Referer'] = list_url.encode()

          for page in range(1, int(page_count.group(1)) + 1):
              data = {
                  "pn": page,
                  "kd": "python",
              }
              response = self.handle_request(method="POST", url=page_url, data=data, info=city)
              print(response)

      def handle_request(self, method, url, data=None, info=None, max_retries=None):
          """Send one request through the proxy, retrying until it succeeds.

          Args:
              method: ``"GET"`` or ``"POST"``.
              url: Target URL.
              data: Form data for a POST request.
              info: City name used to re-fetch the listing page (and thereby
                  new cookies) after a failure; ``None`` skips that step.
              max_retries: Maximum number of retries; ``None`` (the default)
                  retries indefinitely.

          Returns:
              The response body as text. A POST body is decoded as UTF-8 and
              is only returned once it no longer contains the rate-limit
              marker '频繁'.

          Raises:
              ValueError: ``method`` is neither ``"GET"`` nor ``"POST"``.
              RuntimeError: ``max_retries`` was given and has been exhausted.
          """
          if method not in ("GET", "POST"):
              raise ValueError("unsupported HTTP method: %r" % (method,))

          proxy_url = "http://%s:%s@%s:%s" % (
              self.PROXY_USER, self.PROXY_PASS, self.PROXY_HOST, self.PROXY_PORT,
          )
          proxies = {
              "http": proxy_url,
              "https": proxy_url,
          }

          retries = 0
          while True:
              try:
                  if method == "GET":
                      response = self.laGou_session.get(
                          url=url, headers=self.header,
                          proxies=proxies, timeout=self.REQUEST_TIMEOUT)
                      return response.text
                  response = self.laGou_session.post(
                      url=url, headers=self.header, data=data,
                      proxies=proxies, timeout=self.REQUEST_TIMEOUT)
              except requests.RequestException:
                  # Proxy or network failure: fall through to the retry path.
                  pass
              else:
                  response.encoding = 'utf-8'
                  if '频繁' not in response.text:
                      return response.text

              retries += 1
              if max_retries is not None and retries > max_retries:
                  raise RuntimeError(
                      "request to %s failed after %d retries" % (url, max_retries))

              # Clear the cookies first, then obtain new ones before retrying.
              self._refresh_cookies(info, proxies)
              time.sleep(self.RETRY_DELAY)

      def _refresh_cookies(self, info, proxies):
          """Clear the session cookies and re-visit the listing page for ``info``."""
          self.laGou_session.cookies.clear()
          if info is None:
              # No city to build the listing URL from; nothing to re-fetch.
              return
          try:
              self.laGou_session.get(
                  url=self._LIST_URL % info, headers=self.header,
                  proxies=proxies, timeout=self.REQUEST_TIMEOUT)
          except requests.RequestException:
              # Best effort only: the caller's retry loop tries again anyway.
              pass
  if __name__ == '__main__':
      # Demo run: collect the city list, then crawl only the first city.
      crawler = HandleLaGou()
      crawler.handle_city()
      if crawler.city_list:
          crawler.handle_city_job(crawler.city_list[0])

 

相关技术文章

点击QQ咨询
开通会员
返回顶部
×
微信扫码支付
微信扫码支付
确定支付下载
请使用微信扫描二维码支付
×

提示信息

×

选择支付方式

  • 微信支付
  • 支付宝付款
确定支付下载