
Commit ce40f8c

add asyncio
1 parent 91faf59 commit ce40f8c

File tree

1 file changed: +98 lines, -0 lines


utils/crawler_img_11.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
# Import dependencies
import time
import aiohttp
import asyncio
import requests
import os
from faker import Faker

headers = {
    'User-Agent': Faker().user_agent()
}

async def download_image(url, keyword, dir_name):
    global count
    # Fetch a single thumbnail asynchronously and write it to disk
    async with aiohttp.ClientSession() as client:
        async with client.get(url, headers=headers) as resp:
            with open(os.path.join(dir_name, f'{keyword}_{count}.jpg'), 'wb') as f:
                f.write(await resp.content.read())
            count += 1
            print(f'Downloading image {count}')
            print(url)

async def download_images(keyword: str, num: int, save_path: str):
    """
    Crawl the first `num` images returned by a Baidu image search for `keyword`
    and download them to the given folder.
    :param keyword: search keyword
    :param num: number of images to download
    :param save_path: save path
    """
    global num_count
    # Create the folder that will hold the downloaded images
    dir_name = f'{save_path}/outputs/{keyword}'
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    page_num = 0
    thumb_list = []
    # Keep crawling result pages until enough image URLs have been collected
    while True:
        print(f'Crawling page {page_num + 1}...')

        # Search API URL for the current page (30 results per page)
        url = f'https://image.baidu.com/search/acjson?tn=resultjs' \
              f'on_com&logid=11513145951136847483&ipn=rj&ct=20132659' \
              f'2&is=&fp=result&fr=&word={keyword}&queryWord={keyword}&' \
              f'cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&late' \
              f'st=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&' \
              f'nc=1&expermode=&nojc=&isAsync=&pn={page_num * 30}&rn=30&gsm=5a&1683422786613='

        # Fake a browser User-Agent for this request
        headers = {
            'User-Agent': Faker().user_agent()
        }

        # Send the HTTP request and parse the JSON response
        response = requests.get(url, headers=headers).json()

        # Collect the thumbnail URL of every image entry on this page
        for image_info in response['data']:
            try:
                thumb_list.append(image_info['thumbURL'])
                num_count += 1

                # Once enough URLs are collected, download them all concurrently and stop
                if num_count >= num:
                    tasks = []
                    for url in thumb_list:
                        tasks.append(asyncio.create_task(download_image(url, keyword, dir_name)))
                    await asyncio.wait(tasks)
                    print(f'Downloaded {num} images in total!')
                    print(f'Images saved to: {dir_name}')
                    return

            except KeyError:
                # The last entry of a result page is an empty dict without 'thumbURL'
                continue
        # Move on to the next result page
        page_num += 1

async def main():
    keyword = input('Enter keyword: ')
    num = int(input('Number of images: '))
    await download_images(keyword, num, save_path='data')

if __name__ == '__main__':
    num_count = 0   # thumbnail URLs collected so far
    count = 0       # images written to disk so far
    start = time.time()
    asyncio.run(main())
    end = time.time()
    print(f'Total time: {int(end - start)}s')
