1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
| import requests, queue, os, urllib3, time, threading
from hashlib import md5
urllib3.disable_warnings() # 忽略ssl报错
def getManyPages(keyword, pages):
    """Fetch one page of Baidu image-search results.

    Args:
        keyword: Search phrase; sent as both ``queryWord`` and ``word``.
        pages: Offset of the first result to return (the ``pn`` parameter).

    Returns:
        The list stored under the ``data`` key of the JSON response. An
        empty list is returned when the request fails, the body is not
        valid JSON, or ``data`` is missing, null, or not a list, so callers
        can always iterate the result.
    """
    params = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': pages,  # start offset
        'rn': 50,  # number of results requested per page
        'gsm': '1e',
        '1532325785686': '',  # key copied from a captured request; it is only a timestamp
    }
    url = 'https://image.baidu.com/search/acjson'  # Baidu image-search JSON endpoint
    try:
        res = requests.get(url, params=params, timeout=(5, 20), verify=False)
        payload = res.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON (older versions of
        # requests raise a plain ValueError from .json()).
        print('百度图片json数据获取失败')
        return []
    # 'data' may be absent or null, and the payload may not be an object at
    # all; normalise every such case to an empty list.
    data = payload.get('data') if isinstance(payload, dict) else None
    return data if isinstance(data, list) else []
def Run():
    """Worker loop: download queued thumbnails until the queue is empty.

    Uses three module-level objects: ``Plist`` (queue of task dicts with
    ``'filename'`` and ``'url'`` keys), ``FilePath`` (destination directory
    prefix) and ``Mlist`` (list of fingerprints of images already saved).
    Each task is fetched with ``requests`` and written to
    ``FilePath + filename``. A task whose download raises a ``requests``
    error is re-queued, at most ``max_retries`` times; the attempt count is
    kept in the task dict under ``'retries'``.
    """
    max_retries = 3  # cap re-queues so a permanently dead URL cannot loop forever
    while True:
        # get_nowait() replaces empty() + get(): another worker can take the
        # last task between those two calls, leaving a blocking get() stuck.
        try:
            data2 = Plist.get_nowait()
        except queue.Empty:
            return
        picpath = FilePath + data2['filename']
        try:
            # The context manager releases the streamed connection on every
            # path, including the early `continue`s below.
            with requests.get(data2['url'], timeout=(5, 30), stream=True) as res:
                if res.status_code != requests.codes.ok:
                    continue
                try:
                    size = int(res.headers['Content-Length'])
                except (KeyError, ValueError):
                    # Header absent or malformed (e.g. chunked transfer):
                    # measure the downloaded body instead.
                    size = len(res.content)
                if size <= 0:
                    continue
                # Fingerprint = md5(size + url); used to skip repeated images.
                destr = (str(size) + data2['url']).encode(encoding='UTF-8')
                m = md5(destr).hexdigest()
                # NOTE(review): this check-then-append on the shared list is
                # not atomic across worker threads.
                if m in Mlist:
                    print('唯一表示符:' + m + ' ,已重复跳过!~~')
                    continue
                # Read the body BEFORE recording the fingerprint: if the read
                # fails, the retry must not be mistaken for a duplicate.
                body = res.content
                Mlist.append(m)
                with open(picpath, 'wb') as file:
                    file.write(body)
        except requests.RequestException as err:
            attempts = data2.get('retries', 0) + 1
            if attempts <= max_retries:
                data2['retries'] = attempts
                Plist.put(data2)
                # str(err): concatenating the exception object itself raises
                # TypeError and would kill this worker.
                print('下载失败,稍后重试!' + str(err))
            else:
                print('下载失败,已放弃:' + data2['url'])
            continue
if __name__ == '__main__':
    # Script entry point: collect thumbnail URLs page by page, then download
    # them with a pool of worker threads running Run().
    # ------------------------------------------
    KeyWords = '白色轿车'  # search keyword
    Num = 2  # number of result pages to fetch (50 results requested per page)
    Tnum = 100  # number of download threads
    # ------------------------------------------
    Name = 0  # running counter used to name files sequentially
    Tlist = []  # worker threads
    Mlist = []  # fingerprints of images already saved (filled by Run)
    Plist = queue.Queue(Num * 100)  # queue of pending download tasks
    FilePath = './baidu/' + KeyWords + '/'  # output directory (relative path)
    Start = time.time()
    os.makedirs(FilePath, exist_ok=True)

    for xx in range(Num):
        # `or []` guards against a page fetch that yields None.
        dataList = getManyPages(KeyWords, xx * 50) or []
        for x in dataList:
            if 'thumbURL' in x:
                filename = 'baidu_' + KeyWords + '_' + str(Name) + '.jpg'
                data = {'filename': filename, 'url': x['thumbURL']}
                Plist.put(data)
                Name += 1
        print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))
    print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))

    print('开始启动下载线程')
    for _ in range(Tnum):
        # daemon=False replaces the deprecated setDaemon(False).
        t = threading.Thread(target=Run, daemon=False)
        t.start()
        Tlist.append(t)
    print('所有线程启动完成,全力下载')

    # Monitor progress once per second and replace workers that died while
    # tasks remain. active_count()/is_alive() replace the camelCase aliases
    # (Thread.isAlive was removed in Python 3.9).
    while not Plist.empty() and threading.active_count() > 1:
        time.sleep(1)
        print('当前线程总数:' + str(threading.active_count()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
            round(time.time() - Start)) + 'S')
        for idx, worker in enumerate(Tlist):
            if not worker.is_alive() and not Plist.empty():
                print('其中一个线程死掉,马上重启')
                replacement = threading.Thread(target=Run, daemon=False)
                replacement.start()
                # Replace in place: popping while iterating by index shifts
                # the remaining entries and skips workers.
                Tlist[idx] = replacement

    # Wait for in-flight downloads so the summary counts every saved image.
    for worker in Tlist:
        worker.join()

    print('全部下载完毕,下载需求数:' + str(Num * 50) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
        round(time.time() - Start)) + ' S')
|