这个爬虫是18年过年在家无聊写的,具体咋分析已经差不多忘记了,直接上代码了。
这是一个爬取花瓣网图片的爬虫:先去花瓣网找到自己想要的图片页面,复制该页面的链接,然后运行脚本并粘贴链接即可。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
# -*- coding: utf-8 -*-
# date: 2018.2.20
#
# Crawler for huaban.com: reads a board or search URL from stdin, walks its
# pages and saves every sufficiently liked/repinned image under ``pwd``.
import json
import os
import re

import requests
from requests.exceptions import RequestException

url = input("请输入网址:")
image_url = "http://hbimg.b0.upaiyun.com/"
# Continuation URL for board pages: paginated by the id of a pin ("max=").
d_next_url = url + '?jdsmifcz&limit=20&wfl=1&max='
# Continuation URL for search pages: paginated by page number ("page=").
l_next_url = url + '&jdtyxl2z&per_page=20&wfl=1&page='
pwd = './huaban/'
# Number of the last search result page requested so far.
b = 0

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 15
# Search pagination keeps going while ``b`` is at most this value.
MAX_SEARCH_PAGE = 100
# Bytes written to disk per chunk while streaming an image.
DOWNLOAD_CHUNK_SIZE = 8192
# The page embeds its pins as a JavaScript assignment on a single line:
#   app.page["pins"] = [ ...JSON... ];
PINS_PATTERN = re.compile(r'app\.page\["pins"\]\s*=\s*(.*)')


def get_main(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None when the request fails or the status code is not 200.
    Request errors are printed rather than raised.
    """
    try:
        result = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException as r:
        print("error: ", r)
        return None
    if result.status_code != 200:
        return None
    result.encoding = 'utf-8'
    return result.text


def down_image(filename, url):
    """Download *url* and store it at *filename*.

    The body is streamed into ``filename + '.part'`` and renamed only once
    the transfer has finished, so an interrupted download never leaves a
    truncated file behind under the final name. Errors are printed rather
    than raised (best-effort download).
    """
    part_name = filename + '.part'
    try:
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as result:
            # Do not save an HTML error page as if it were an image.
            result.raise_for_status()
            with open(part_name, 'wb') as f:
                for chunk in result.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
        os.replace(part_name, filename)
    except (RequestException, OSError) as e:
        print("error: ", e)
        if os.path.isfile(part_name):
            os.remove(part_name)


def _parse_pins(text):
    """Return the list of pin dicts embedded in the page HTML *text*.

    Returns an empty list when *text* is empty/None, when the pins
    assignment is not present, or when its payload is not a JSON list.
    """
    if not text:
        return []
    match = PINS_PATTERN.search(text)
    if match is None:
        return []
    # Drop the trailing ';' of the JavaScript statement.
    payload = match.group(1).rstrip().rstrip(';')
    try:
        # The payload comes from a remote server: parse it as JSON instead
        # of executing it with eval().
        pins = json.loads(payload)
    except ValueError as e:
        print("error: ", e)
        return []
    return pins if isinstance(pins, list) else []


def _is_popular(like_count, repin_count):
    """Return True when a pin has enough likes/repins to be downloaded."""
    return ((like_count > 0 and repin_count > 2)
            or repin_count > 10
            or like_count > 10)


def _download_page(page_url):
    """Download the qualifying images of one page.

    Returns the pin id to continue board pagination from, or None when the
    page yielded no pins at all (fetch failed, nothing embedded, end of
    the listing).
    """
    print('正在请求的网址是:', page_url)
    page = _parse_pins(get_main(page_url))
    if not page:
        return None

    photo_num = 0
    url_pin_id = None
    for item in page:
        if not _is_popular(item['like_count'], item['repin_count']):
            continue
        print('正在下载第{0}张图片'.format(photo_num))
        url_pin_id = item['pin_id']
        filename = pwd + str(item['pin_id']) + '.jpg'
        if os.path.isfile(filename):
            print('文件已经存在,正在下载下一张 ', filename)
            continue
        down_image(filename, image_url + item['file']['key'] + '_fw658')
        photo_num += 1

    if url_pin_id is None:
        # No pin on this page qualified; continue from the last pin listed
        # so pagination can still advance.
        url_pin_id = page[-1]['pin_id']
    return url_pin_id


def get_page(url):
    """Crawl *url* and its follow-up pages, downloading qualifying images.

    URLs containing ``'search'`` are paginated by page number through
    ``l_next_url`` while the global counter ``b`` is at most
    ``MAX_SEARCH_PAGE``. All other URLs are paginated through
    ``d_next_url`` by pin id. Crawling stops at the first page that yields
    no pins, or (for pin-id pagination) when the continuation id stops
    advancing.
    """
    global b
    is_search = 'search' in url
    last_pin_id = _download_page(url)
    while last_pin_id is not None and b <= MAX_SEARCH_PAGE:
        if is_search:
            b += 1
            next_url = l_next_url + str(b)
        else:
            next_url = d_next_url + str(last_pin_id)
        next_pin_id = _download_page(next_url)
        if not is_search and next_pin_id == last_pin_id:
            # Same continuation id again: stop instead of requesting the
            # same page forever.
            break
        last_pin_id = next_pin_id
def main():
    """Create the download directory if it is missing, then crawl ``url``."""
    target_missing = not os.path.exists(pwd)
    if target_missing:
        os.mkdir(pwd)
    get_page(url)
if __name__ == '__main__':
    # Script entry point: print the author banner, then start the crawl.
    #
    # The banner is a raw literal because its ASCII art contains backslashes
    # that are not valid escape sequences ("\ ", "\_", "\|"); in a normal
    # literal they trigger an invalid-escape warning. The printed text is
    # unchanged.
    #
    # NOTE(review): the banner's line breaks and column spacing appear to
    # have been lost when this file was extracted; restore the original
    # multi-line layout from the upstream copy if one is available.
    print(
        r""" By: _ __ _ _ _ | |/ /___ _ _| |__ ___ / \ _ __ __| | | ' // _ \ | | | '_ \ / _ \ / _ \ | '__/ _` | | . \ __/ |_| | |_) | (_) / ___ \| | | (_| | |_|\_\___|\__, |_.__/ \___/_/ \_\_| \__,_| |___/ (适用于Python3.6) """
    )
    main()
|