You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
python-project/spider/doutu.py

43 lines
1.4 KiB

#!/usr/bin/env python3
#_*_ coding: utf-8 _*_
'''
***********************************************
authOr: newrain *
blog: https://blog.csdn.net/NewRain_wang *
https://newrain001.gitee.io *
github: https://github.com/newrain001 *
gitee : https://gitee.com/newrain001 *
email : newrain_wang@163.com *
***********************************************
'''
import requests
from lxml import etree
import time
import os
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Disable urllib3's InsecureRequestWarning (the requests in this script are made with verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def getUrl(list_url='https://www.doutula.com/photo/list', save_dir='image',
           delay=1, timeout=10):
    """Download every image referenced by the photo list page.

    Fetches ``list_url``, collects the ``data-original`` attribute of each
    ``<img>`` matched by the XPath below, and writes each image into
    ``save_dir`` using the last path segment of its URL as the file name.
    If the list page does not answer with HTTP 200 nothing is downloaded.

    Args:
        list_url: Page that is scraped for image URLs.
        save_dir: Directory the images are written to; created when missing.
        delay: Seconds to sleep after each download attempt.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        None.

    Raises:
        requests.RequestException: If the list page itself cannot be fetched.
            Failures of individual images are reported and skipped instead.
    """
    header = {
        # The value must not repeat the "User-Agent:" field name itself.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Referer': 'www.doutula.com',
    }
    # NOTE(review): verify=False turns off TLS certificate verification.
    # It is kept because the original script relies on it, but it should be
    # removed if the site's certificate can be validated.
    html = requests.get(list_url, headers=header, verify=False, timeout=timeout)
    if html.status_code != 200:
        return

    urls = etree.HTML(html.text).xpath('//ul/li/div/div/a/img/@data-original')
    if not urls:
        return

    # Create the target directory once, without a check-then-create race.
    os.makedirs(save_dir, exist_ok=True)

    for i in urls:
        name = i.split("/")[-1]
        if not name:
            # URL ends with "/": there is no usable file name, skip it.
            continue
        try:
            result = requests.get(i, headers=header, verify=False, timeout=timeout)
        except requests.RequestException as err:
            # One broken image must not abort the remaining downloads.
            print(name, '下载失败', err)
        else:
            if result.status_code == 200:
                with open(os.path.join(save_dir, name), 'wb') as f:
                    f.write(result.content)
                print(name, '已完成')
            else:
                # Do not store an error page body as if it were an image.
                print(name, '下载失败', result.status_code)
        # Pause between requests, as the original does.
        time.sleep(delay)
# Run the scraper. The call is not guarded, so it also executes when this module is imported.
getUrl()