You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
myblog/App/csdn.py

40 lines
2.0 KiB

import requests, json
from lxml import etree
# from apscheduler.schedulers.background import BackgroundScheduler
# from django_apscheduler.jobstores import DjangoJobStore, register_job, register_events
#
# scheduler = BackgroundScheduler()
# scheduler.add_jobstore(DjangoJobStore(), "default")
def get_image(resolving='300x200', keyword='library'):
    """Return the URL of an Unsplash image for the given size and keyword.

    Sends a HEAD request to ``source.unsplash.com`` and reads the redirect
    target from the ``Location`` response header instead of downloading the
    image itself.

    Args:
        resolving: Image size, interpolated into the URL path
            (default ``'300x200'``).
        keyword: Search keyword, interpolated into the URL query string
            (default ``'library'``).

    Returns:
        The value of the ``Location`` header, or a fixed fallback image URL
        when the request fails, times out, or the header is missing or empty.
    """
    fallback = 'https://images.unsplash.com/photo-1567168539593-59673ababaae?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=500&ixid=MnwxfDB8MXxyYW5kb218MHx8Ym9vayxsaWJyYXJ5fHx8fHx8MTY1MTU1NDcyNA&ixlib=rb-1.2.1&q=80&utm_campaign=api-credit&utm_medium=referral&utm_source=unsplash_source&w=500'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.head(
            f'https://source.unsplash.com/{resolving}/?{keyword}',
            headers=headers,
            timeout=10,
        )
    except requests.RequestException:
        # The image is decorative; a network failure should not abort the caller.
        return fallback
    # .get() avoids a KeyError when the service answers without a redirect,
    # which is the case the fallback URL exists for.
    return response.headers.get('Location') or fallback
# @register_job(scheduler, 'interval', minutes=60*24, replace_existing=True)
def csdn_get():
    """Scrape recent posts from a CSDN blog page and save them as JSON.

    Fetches ``https://blog.csdn.net/NewRain_wang``, extracts up to three
    article links, ``<h4>`` texts and ``blog-list-content`` texts with XPath,
    and pairs them positionally. Each entry is keyed by the article URL and
    holds ``title``, ``text`` and an ``image`` URL from ``get_image()``.

    When at least one entry is found, the mapping is written to
    ``App/csdn.json`` (relative to the current working directory) and a
    success message is printed; otherwise only a failure message is printed.

    Returns:
        None.

    Raises:
        requests.RequestException: If the page request fails or times out.
        OSError: If ``App/csdn.json`` cannot be opened for writing.
    """
    response = requests.get(
        'https://blog.csdn.net/NewRain_wang',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'},
        timeout=10,  # do not hang indefinitely on a stalled connection
    )
    tree = etree.HTML(response.content)

    # Each query keeps nodes with fewer than five preceding siblings of the
    # same kind in document order; the slice then keeps the first three.
    urls = tree.xpath('//article[@class="blog-list-box"][count(preceding::article) < 5]/a/@href')[:3]
    titles = tree.xpath('//h4[count(preceding::h4) < 5]/text()')[:3]
    texts = tree.xpath('//article[@class="blog-list-box"][count(preceding::article) < 5]/a/div[@class="blog-list-content"]/text()')[:3]

    # zip() stops at the shortest list, so a page where the three queries
    # return different numbers of nodes no longer raises IndexError.
    data = {
        url: {'title': title, 'text': text, 'image': get_image()}
        for url, title, text in zip(urls, titles, texts)
    }

    if not data:
        print('csdn.json 未获取')
        return

    # Alternative kept from the original (disabled): cache.set('csdn', data, 60*60*24)
    with open('App/csdn.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print('csdn.json 已生成')
# register_events(scheduler)
# scheduler.start()
# Run a single scrape when this file is executed directly as a script;
# importing the module does not trigger it.
if __name__ == "__main__":
    csdn_get()