Featured image of post 【日常Py】百度图片关键词多线程爬虫

【日常Py】百度图片关键词多线程爬虫

前几天朋友需要大量不同关键词的图片,于是帮她 :smiley: 写了一个多线程图片爬虫。爬取的是百度图片;换一下接口后,同样适用于搜狗图片和 360 图片。除此之外,还加了一个简单的去重功能。

脚本用 Python 3 编写,只用到了 requests 和几个标准库模块,没有使用爬虫框架(反正我也不会),这些模块已经足够满足需求。使用时需要自行配置部分参数,比如关键词、线程数量等;如果你完全没有 Python 基础,不推荐使用。


爬虫效果:

![2.png](/usr/uploads/2019/01/3074433381.png) ![1.png](/usr/uploads/2019/01/2294687552.png)

百度图片代码:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import requests, queue, os, urllib3, time, threading
from hashlib import md5

urllib3.disable_warnings()  # 忽略ssl报错


def getManyPages(keyword, pages):
    """Fetch one page of Baidu image-search results.

    Args:
        keyword: Search term, sent as both ``queryWord`` and ``word``.
        pages: Offset of the first result to return (sent as ``pn``).

    Returns:
        The list stored under the ``data`` key of the JSON response. An
        empty list is returned when the request fails, the body is not
        valid JSON, the payload is not a JSON object, or ``data`` is
        missing or null, so callers can always iterate the result.
    """
    params = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': pages,  # offset of the first result
        'rn': 50,  # number of results requested per page
        'gsm': '1e',
        # The request carries a bare millisecond timestamp as an extra key;
        # send the current time instead of a stale hard-coded value.
        str(int(time.time() * 1000)): '',
    }
    url = 'https://image.baidu.com/search/acjson'  # Baidu image-search JSON endpoint
    try:
        # NOTE(review): verify=False turns off TLS certificate validation;
        # kept to preserve the original behaviour, but consider removing it.
        res = requests.get(url, params=params, timeout=(5, 20), verify=False)
        payload = res.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        print('百度图片json数据获取失败')
        return []
    if not isinstance(payload, dict):
        print('百度图片json数据获取失败')
        return []
    # ``data`` may be absent or null; normalise both to an empty list.
    return payload.get('data') or []


_MAX_ATTEMPTS = 3  # total download attempts per URL before giving up
_DEDUP_LOCK = threading.Lock()  # makes the check-then-append on Mlist atomic


def Run():
    """Worker loop: download queued images until the queue is drained.

    Reads the module-level ``Plist`` (queue of ``{'filename', 'url'}``
    dicts), ``FilePath`` (output directory prefix) and ``Mlist`` (list of
    fingerprints of images already saved). Each image is fingerprinted by
    the MD5 of its size string plus its URL; images whose fingerprint is
    already in ``Mlist`` are skipped. A failed download is re-queued up to
    ``_MAX_ATTEMPTS`` total attempts, then dropped.
    """
    while True:
        # get_nowait avoids the empty()/get() race in which another worker
        # takes the last item and this one blocks forever.
        try:
            data2 = Plist.get_nowait()
        except queue.Empty:
            return
        picpath = FilePath + data2['filename']
        try:
            # The context manager releases the streamed connection on every path.
            with requests.get(data2['url'], timeout=(5, 30), stream=True) as res:
                if res.status_code != requests.codes.ok:
                    continue
                body = res.content
                # Content-Length may be absent (e.g. chunked responses);
                # fall back to the size of the downloaded body.
                size = res.headers.get('Content-Length', str(len(body)))
        except requests.RequestException as err:
            attempts = data2.get('retries', 0) + 1
            if attempts < _MAX_ATTEMPTS:
                data2['retries'] = attempts
                Plist.put(data2)
                print('下载失败,稍后重试!' + str(err))
            else:
                # Stop re-queuing a URL that keeps failing.
                print('下载失败,已放弃:' + data2['url'])
            continue
        if not body:
            continue
        destr = (str(size) + data2['url']).encode(encoding='UTF-8')
        m = md5(destr).hexdigest()
        with _DEDUP_LOCK:
            duplicate = m in Mlist
            if not duplicate:
                Mlist.append(m)
        if duplicate:
            print('唯一表示符:' + m + ' ,已重复跳过!~~')
            continue
        with open(picpath, 'wb') as file:
            file.write(body)


if __name__ == '__main__':
    # Script entry point: collect thumbnail URLs page by page, then download
    # them with a pool of worker threads and report the totals.
    # ------------------------------------------
    KeyWords = '白色轿车'  # search keyword
    Num = 2  # number of result pages to request
    Tnum = 100  # number of download threads
    # ------------------------------------------
    PageSize = 50  # results requested per page
    Name = 0  # running counter used to name the files
    Tlist = list()  # worker threads
    Mlist = list()  # fingerprints of images already saved
    Plist = queue.Queue(Num * 100)  # queue of pending downloads
    FilePath = './baidu/' + KeyWords + '/'  # output directory (relative path)
    Start = time.time()
    os.makedirs(FilePath, exist_ok=True)

    for xx in range(Num):
        # Guard against a None result so a failed page does not crash the loop.
        dataList = getManyPages(KeyWords, xx * PageSize) or []
        for x in dataList:
            if isinstance(x, dict) and 'thumbURL' in x:
                filename = 'baidu_' + KeyWords + '_' + str(Name) + '.jpg'
                data = {'filename': filename, 'url': x['thumbURL']}
                Plist.put(data)
                Name += 1
        print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))

    print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
    print('开始启动下载线程')
    for _ in range(Tnum):
        t = threading.Thread(target=Run, daemon=False)
        t.start()
        Tlist.append(t)
    print('所有线程启动完成,全力下载')
    while not Plist.empty() and any(worker.is_alive() for worker in Tlist):
        time.sleep(1)
        print('当前线程总数:' + str(threading.active_count()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
            round(time.time() - Start)) + 'S')
        # Replace dead workers in place; popping while indexing skips entries.
        for idx, worker in enumerate(Tlist):
            if not worker.is_alive() and not Plist.empty():
                print('其中一个线程死掉,马上重启')
                replacement = threading.Thread(target=Run, daemon=False)
                replacement.start()
                Tlist[idx] = replacement
    # Wait for in-flight downloads so the summary reflects the final count.
    for worker in Tlist:
        worker.join()
    print('全部下载完毕,下载需求数:' + str(Num * PageSize) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
        round(time.time() - Start)) + ' S')

搜狗图片代码:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests, queue, os, urllib3, time, threading
from hashlib import md5

urllib3.disable_warnings()  # 忽略ssl报错


def getManyPages(keyword, pages):
    """Fetch one page of Sogou image-search results.

    Args:
        keyword: Search term (sent as ``query``).
        pages: Offset of the first result to return (sent as ``start``).

    Returns:
        The list stored under the ``items`` key of the JSON response. An
        empty list is returned when the request fails, the body is not
        valid JSON, the payload is not a JSON object, or ``items`` is
        missing or null, so callers can always iterate the result.
    """
    url = 'https://pic.sogou.com/pics'  # Sogou image-search endpoint
    # Passing the query via ``params`` lets requests URL-encode the keyword,
    # so characters such as '&' or '#' cannot corrupt the query string.
    params = {'query': keyword, 'start': pages, 'reqType': 'ajax'}
    try:
        # NOTE(review): verify=False turns off TLS certificate validation;
        # kept to preserve the original behaviour, but consider removing it.
        res = requests.get(url, params=params, timeout=(5, 20), verify=False)
        payload = res.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        print('json数据获取失败')
        return []
    if not isinstance(payload, dict):
        print('json数据获取失败')
        return []
    # ``items`` may be absent or null; normalise both to an empty list.
    return payload.get('items') or []


_MAX_ATTEMPTS = 3  # total download attempts per URL before giving up
_DEDUP_LOCK = threading.Lock()  # makes the check-then-append on Mlist atomic


def Run():
    """Worker loop: download queued images until the queue is drained.

    Reads the module-level ``Plist`` (queue of ``{'filename', 'url'}``
    dicts), ``FilePath`` (output directory prefix) and ``Mlist`` (list of
    fingerprints of images already saved). Each image is fingerprinted by
    the MD5 of its size string plus its URL; images whose fingerprint is
    already in ``Mlist`` are skipped. A failed download is re-queued up to
    ``_MAX_ATTEMPTS`` total attempts, then dropped.
    """
    while True:
        # get_nowait avoids the empty()/get() race in which another worker
        # takes the last item and this one blocks forever.
        try:
            data2 = Plist.get_nowait()
        except queue.Empty:
            return
        picpath = FilePath + data2['filename']
        try:
            # The context manager releases the streamed connection on every path.
            with requests.get(data2['url'], timeout=(5, 30), stream=True) as res:
                if res.status_code != requests.codes.ok:
                    continue
                body = res.content
                # Content-Length may be absent (e.g. chunked responses);
                # fall back to the size of the downloaded body.
                size = res.headers.get('Content-Length', str(len(body)))
        except requests.RequestException as err:
            attempts = data2.get('retries', 0) + 1
            if attempts < _MAX_ATTEMPTS:
                data2['retries'] = attempts
                Plist.put(data2)
                print('下载失败,稍后重试!' + str(err))
            else:
                # Stop re-queuing a URL that keeps failing.
                print('下载失败,已放弃:' + data2['url'])
            continue
        if not body:
            continue
        destr = (str(size) + data2['url']).encode(encoding='UTF-8')
        m = md5(destr).hexdigest()
        with _DEDUP_LOCK:
            duplicate = m in Mlist
            if not duplicate:
                Mlist.append(m)
        if duplicate:
            print('唯一表示符:' + m + ' ,已重复跳过!~~')
            continue
        with open(picpath, 'wb') as file:
            file.write(body)


if __name__ == '__main__':
    # Script entry point: collect thumbnail URLs page by page, then download
    # them with a pool of worker threads and report the totals.
    # ------------------------------------------
    KeyWords = '轿车'  # search keyword
    Num = 50  # number of result pages to request
    Tnum = 100  # number of download threads
    # ------------------------------------------
    PageSize = 48  # offset step between consecutive pages
    Name = 0  # running counter used to name the files
    Tlist = list()  # worker threads
    Mlist = list()  # fingerprints of images already saved
    Plist = queue.Queue(Num * 100)  # queue of pending downloads
    FilePath = './sougou/' + KeyWords + '/'  # output directory (relative path)
    Start = time.time()
    os.makedirs(FilePath, exist_ok=True)

    for xx in range(Num):
        # Guard against a None result so a failed page does not crash the loop.
        dataList = getManyPages(KeyWords, xx * PageSize) or []
        for x in dataList:
            if isinstance(x, dict) and 'thumbUrl' in x:
                filename = 'sougou_' + KeyWords + '_' + str(Name) + '.jpg'
                data = {'filename': filename, 'url': x['thumbUrl']}
                Plist.put(data)
                Name += 1
        print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))

    print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
    print('开始启动下载线程')
    for _ in range(Tnum):
        t = threading.Thread(target=Run, daemon=False)
        t.start()
        Tlist.append(t)
    print('所有线程启动完成,全力下载')
    while not Plist.empty() and any(worker.is_alive() for worker in Tlist):
        time.sleep(1)
        print('当前线程总数:' + str(threading.active_count()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
            round(time.time() - Start)) + 'S')
        # Replace dead workers in place; popping while indexing skips entries.
        for idx, worker in enumerate(Tlist):
            if not worker.is_alive() and not Plist.empty():
                print('其中一个线程死掉,马上重启')
                replacement = threading.Thread(target=Run, daemon=False)
                replacement.start()
                Tlist[idx] = replacement
    # Joining replaces the fixed 3-second sleep: wait exactly as long as the
    # in-flight downloads need, so the summary reflects the final count.
    for worker in Tlist:
        worker.join()
    # The requested total uses this script's page size (48), not 50.
    print('全部下载完毕,下载需求数:' + str(Num * PageSize) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
        round(time.time() - Start)) + ' S')

360图片代码:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests, queue, os, urllib3, time, threading
from hashlib import md5

urllib3.disable_warnings()  # 忽略ssl报错


def getManyPages(keyword, pages):
    """Fetch one page of 360 image-search results.

    Args:
        keyword: Search term (sent as ``q``).
        pages: Offset of the first result to return (sent as ``sn``).

    Returns:
        The list stored under the ``list`` key of the JSON response. An
        empty list is returned when the request fails, the body is not
        valid JSON, the payload is not a JSON object, or ``list`` is
        missing or null, so callers can always iterate the result.
    """
    url = 'http://image.so.com/j'  # 360 image-search JSON endpoint
    # Passing the query via ``params`` lets requests URL-encode the keyword,
    # so characters such as '&' or '#' cannot corrupt the query string.
    params = {'q': keyword, 'pn': 50, 'sn': pages}  # pn = results per page
    try:
        res = requests.get(url, params=params, timeout=(5, 20), verify=False)
        payload = res.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        print('360图片json数据获取失败')
        return []
    if not isinstance(payload, dict):
        print('360图片json数据获取失败')
        return []
    # ``list`` may be absent or null; normalise both to an empty list.
    return payload.get('list') or []


_MAX_ATTEMPTS = 3  # total download attempts per URL before giving up
_DEDUP_LOCK = threading.Lock()  # makes the check-then-append on Mlist atomic


def Run():
    """Worker loop: download queued images until the queue is drained.

    Reads the module-level ``Plist`` (queue of ``{'filename', 'url'}``
    dicts), ``FilePath`` (output directory prefix) and ``Mlist`` (list of
    fingerprints of images already saved). Each image is fingerprinted by
    the MD5 of its size string plus its URL; images whose fingerprint is
    already in ``Mlist`` are skipped. A failed download is re-queued up to
    ``_MAX_ATTEMPTS`` total attempts, then dropped.
    """
    while True:
        # get_nowait avoids the empty()/get() race in which another worker
        # takes the last item and this one blocks forever.
        try:
            data2 = Plist.get_nowait()
        except queue.Empty:
            return
        picpath = FilePath + data2['filename']
        try:
            # The context manager releases the streamed connection on every path.
            with requests.get(data2['url'], timeout=(5, 30), stream=True) as res:
                if res.status_code != requests.codes.ok:
                    continue
                body = res.content
                # Content-Length may be absent (e.g. chunked responses);
                # fall back to the size of the downloaded body.
                size = res.headers.get('Content-Length', str(len(body)))
        except requests.RequestException as err:
            attempts = data2.get('retries', 0) + 1
            if attempts < _MAX_ATTEMPTS:
                data2['retries'] = attempts
                Plist.put(data2)
                print('下载失败,稍后重试!' + str(err))
            else:
                # Stop re-queuing a URL that keeps failing.
                print('下载失败,已放弃:' + data2['url'])
            continue
        if not body:
            continue
        destr = (str(size) + data2['url']).encode(encoding='UTF-8')
        m = md5(destr).hexdigest()
        with _DEDUP_LOCK:
            duplicate = m in Mlist
            if not duplicate:
                Mlist.append(m)
        if duplicate:
            print('唯一表示符:' + m + ' ,已重复跳过!~~')
            continue
        with open(picpath, 'wb') as file:
            file.write(body)


if __name__ == '__main__':
    # Script entry point: collect thumbnail URLs page by page, then download
    # them with a pool of worker threads and report the totals.
    # ------------------------------------------
    KeyWords = '黑色轿车'  # search keyword
    Num = 40  # number of result pages to request
    Tnum = 100  # number of download threads
    # ------------------------------------------
    PageSize = 50  # results requested per page
    Name = 0  # running counter used to name the files
    Tlist = list()  # worker threads
    Mlist = list()  # fingerprints of images already saved
    Plist = queue.Queue(Num * 100)  # queue of pending downloads
    FilePath = './360/' + KeyWords + '/'  # output directory (relative path)
    Start = time.time()
    os.makedirs(FilePath, exist_ok=True)

    for xx in range(Num):
        # Guard against a None result so a failed page does not crash the loop.
        dataList = getManyPages(KeyWords, xx * PageSize) or []
        for x in dataList:
            if isinstance(x, dict) and 'thumb_bak' in x:
                filename = '360_' + KeyWords + '_' + str(Name) + '.jpg'
                data = {'filename': filename, 'url': x['thumb_bak']}
                Plist.put(data)
                Name += 1
        print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))

    print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
    print('开始启动下载线程')
    for _ in range(Tnum):
        t = threading.Thread(target=Run, daemon=False)
        t.start()
        Tlist.append(t)
    print('所有线程启动完成,全力下载')
    while not Plist.empty() and any(worker.is_alive() for worker in Tlist):
        time.sleep(1)
        print('当前线程总数:' + str(threading.active_count()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
            round(time.time() - Start)) + 'S')
        # Replace dead workers in place; popping while indexing skips entries.
        for idx, worker in enumerate(Tlist):
            if not worker.is_alive() and not Plist.empty():
                print('其中一个线程死掉,马上重启')
                replacement = threading.Thread(target=Run, daemon=False)
                replacement.start()
                Tlist[idx] = replacement
    # Joining replaces the fixed 3-second sleep: wait exactly as long as the
    # in-flight downloads need, so the summary reflects the final count.
    for worker in Tlist:
        worker.join()
    print('全部下载完毕,下载需求数:' + str(Num * PageSize) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
        round(time.time() - Start)) + ' S')
使用 Hugo 构建
主题 StackJimmy 设计