# Import dependencies
import time
import aiohttp
import asyncio
import requests
import os
from faker import Faker
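# Third-party packages used below: aiohttp, requests and Faker
# (install with: pip install aiohttp requests Faker)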

headers = {
    'User-Agent': Faker().user_agent()
}
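# Faker().user_agent() returns a random, realistic browser User-Agent string,
# so the scraper does not always advertise the same client.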
async def download_image(url, keyword, index, dir_name):
    # Fetch a single image and save it to disk. `index` is assigned by the
    # caller so that filenames stay unique across concurrent downloads.
    async with aiohttp.ClientSession() as client:
        async with client.get(url, headers=headers) as resp:
            data = await resp.content.read()
    with open(os.path.join(dir_name, f'{keyword}_{index}.jpg'), 'wb') as f:
        f.write(data)
    print(f'Downloaded image {index + 1}')
    print(url)
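# Design note: opening a new ClientSession per image is simple but costly; a
# single shared session, created once and passed into each task, would let
# aiohttp reuse TCP connections across downloads.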

async def download_images(keyword: str, num: int, save_path: str):
    """
    Crawl Baidu image search results for `keyword` and download the
    first `num` images into a folder under `save_path`.
    :param keyword: search keyword
    :param num: number of images to download
    :param save_path: root directory for the output folder
    """
    # Create the folder that will hold the images
    dir_name = f'{save_path}/outputs/{keyword}'
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    num_count = 0  # number of thumbnail URLs collected so far
    page_num = 0
    thumb_list = []
    # Keep crawling result pages until enough image URLs are collected
    while True:
        print(f'Crawling page {page_num + 1}...')

        # Baidu image-search JSON API (acjson endpoint)
        url = f'https://image.baidu.com/search/acjson?tn=resultjs' \
              f'on_com&logid=11513145951136847483&ipn=rj&ct=20132659' \
              f'2&is=&fp=result&fr=&word={keyword}&queryWord={keyword}&' \
              f'cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&late' \
              f'st=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&' \
              f'nc=1&expermode=&nojc=&isAsync=&pn={page_num * 30}&rn=30&gsm=5a&1683422786613='
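        # pn is the result offset and rn the page size, so each iteration
        # requests the next batch of 30 results.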

        # Fake request headers: a fresh random User-Agent for every page
        headers = {
            'User-Agent': Faker().user_agent()
        }

        # Send the HTTP request and parse the JSON response
        response = requests.get(url, headers=headers).json()
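        # Note: requests.get() blocks the event loop while the search page is
        # fetched. A fully non-blocking variant (a sketch, not wired in here)
        # could fetch the JSON with aiohttp instead:
        #   async with aiohttp.ClientSession() as client:
        #       async with client.get(url, headers=headers) as resp:
        #           response = await resp.json(content_type=None)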

        # Iterate over the image entries on this page
        for image_info in response['data']:
            try:
                # Collect the thumbnail URL for later concurrent download
                thumb_list.append(image_info['thumbURL'])
                # print(image_info['thumbURL'])
                # Synchronous alternative: download each image immediately
                # image_data = requests.get(image_info['thumbURL'], headers=headers)
                # with open(os.path.join(dir_name, f'{keyword}_{num_count}.jpg'), 'wb') as f:
                #     f.write(image_data.content)
                num_count += 1

                # Once enough URLs are collected, download them all
                # concurrently and finish
                if num_count >= num:
                    tasks = []
                    for index, thumb_url in enumerate(thumb_list):
                        tasks.append(asyncio.create_task(
                            download_image(thumb_url, keyword, index, dir_name)))
                    await asyncio.wait(tasks)
                    print(f'Downloaded {num} images in total!')
                    print(f'Images saved to: {dir_name}')
                    return

            except KeyError:
                # The last entry in `data` is an empty dict without 'thumbURL'
                continue
        # Move on to the next page of results
        page_num += 1

async def main():
    keyword = input('Enter a keyword: ')
    num = int(input('Enter the number of images: '))  # int() instead of eval()
    await download_images(keyword, num, save_path='data')
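
# Example: entering keyword "cat" and number 60 saves the thumbnails as
# data/outputs/cat/cat_0.jpg ... cat_59.jpg. The coroutine can also be run
# directly without the prompts, e.g.
#   asyncio.run(download_images('cat', 60, save_path='data'))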

if __name__ == '__main__':
    start = time.time()
    # asyncio.run() creates and closes the event loop for us (Python 3.7+)
    asyncio.run(main())
    end = time.time()
    print(f'Total time: {int(end - start)}s')