import json

import requests
from lxml import etree

# from apscheduler.schedulers.background import BackgroundScheduler
# from django_apscheduler.jobstores import DjangoJobStore, register_job, register_events
#
# scheduler = BackgroundScheduler()
# scheduler.add_jobstore(DjangoJobStore(), "default")

# Browser-like User-Agent sent with every outgoing request in this module.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

# Seconds to wait for a server before giving up; without a timeout,
# ``requests`` would block indefinitely on a stalled connection.
_REQUEST_TIMEOUT = 10

# Fixed image URL returned when no redirect target is available.
_FALLBACK_IMAGE = 'https://images.unsplash.com/photo-1567168539593-59673ababaae?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=500&ixid=MnwxfDB8MXxyYW5kb218MHx8Ym9vayxsaWJyYXJ5fHx8fHx8MTY1MTU1NDcyNA&ixlib=rb-1.2.1&q=80&utm_campaign=api-credit&utm_medium=referral&utm_source=unsplash_source&w=500'


def get_image(resolving='300x200', keyword='library') -> str:
    """Return an image URL for the given size and keyword.

    Sends a HEAD request to ``source.unsplash.com`` and returns the value of
    the response's ``Location`` header. When that header is missing or empty,
    a fixed fallback image URL is returned instead.

    Args:
        resolving: Size segment placed in the request path, e.g. ``'300x200'``.
        keyword: Search keyword placed in the query string.

    Returns:
        The redirect target URL, or the fallback URL.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    response = requests.head(
        f'https://source.unsplash.com/{resolving}/?{keyword}',
        headers=_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    # ``.get`` instead of ``[...]``: a response without a Location header
    # must reach the fallback rather than raise KeyError.
    return response.headers.get('Location') or _FALLBACK_IMAGE


# @register_job(scheduler, 'interval', minutes=60*24, replace_existing=True)
def csdn_get() -> None:
    """Scrape the CSDN blog list page and dump the entries to ``App/csdn.json``.

    Fetches ``https://blog.csdn.net/NewRain_wang``, extracts at most three
    article URLs, titles and content texts via XPath, attaches an image URL
    from ``get_image()`` to each entry, and writes a mapping of
    ``url -> {'title', 'text', 'image'}`` as JSON. Prints a status message
    either way; nothing is written when no entry was extracted.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
        OSError: If ``App/csdn.json`` cannot be opened for writing.
    """
    blog = requests.get(
        'https://blog.csdn.net/NewRain_wang',
        headers=_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    tree = etree.HTML(blog.content)
    urls = tree.xpath('//article[@class="blog-list-box"][count(preceding::article) < 5]/a/@href')[:3]
    titles = tree.xpath('//h4[count(preceding::h4) < 5]/text()')[:3]
    texts = tree.xpath('//article[@class="blog-list-box"][count(preceding::article) < 5]/a/div[@class="blog-list-content"]/text()')[:3]

    # zip() stops at the shortest list, so a page where the three queries
    # return different numbers of nodes no longer raises IndexError.
    data = {}
    for url, title, text in zip(urls, titles, texts):
        data[url] = {'title': title, 'text': text, 'image': get_image()}

    if data:
        #cache.set('csdn', data, 60*60*24)
        with open('App/csdn.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print('csdn.json 已生成')
    else:
        print('csdn.json 未获取')


# register_events(scheduler)
# scheduler.start()

if __name__ == '__main__':
    csdn_get()