堆糖图片多线程抓取
丹丹
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
import urllib.parse
import threading

# Semaphore bounding the number of concurrent download threads to 10.
thread_lock = threading.BoundedSemaphore(value = 10)

def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A timeout is supplied so a stalled server cannot hang the caller
    forever (requests has no default timeout).
    """
    # GET the page, then decode the raw bytes as UTF-8.
    page = requests.get(url, timeout=30).content.decode("utf-8")
    return page

def findall_in_page(page, start_part, end_part):
    """Return every substring of *page* found between *start_part* and *end_part*.

    Scans left to right; each match resumes after the previous one's
    closing delimiter. A *start_part* with no following *end_part* is
    ignored (the original code sliced to index -1 and appended garbage).
    """
    all_strings = []
    search_from = 0
    while True:
        start = page.find(start_part, search_from)
        if start == -1:
            break
        start += len(start_part)
        end = page.find(end_part, start)
        if end == -1:
            # Unterminated match: stop rather than emit a bogus slice.
            break
        all_strings.append(page[start:end])
        search_from = end
    return all_strings

def pic_urls_find_part(pages):
    """Collect every image path (JSON "path" field) found across *pages*."""
    return [
        pic_url
        for page in pages
        for pic_url in findall_in_page(page, '"path":"', '"')
    ]

def pages_usr(label):
    """Download all search-result pages for keyword *label* from duitang."""
    base = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}&limit=1000'
    # Percent-encode the keyword so non-ASCII (e.g. Chinese) text is URL-safe.
    encoded = urllib.parse.quote(label)
    # Observed from the API response: the maximum start offset is 3600.
    return [
        get_page(base.format(encoded, offset))
        for offset in range(0, 3600, 100)
    ]

def download_pic(url, name):
    """Download one image from *url* to test/杨幂<name>.jpg.

    Runs in a worker thread; releases one semaphore slot when done.
    The release is in a ``finally`` block: if the request or the file
    write raises, the original code leaked the slot, and after enough
    failures main() would deadlock on acquire().
    """
    try:
        req = requests.get(url, timeout=30)
        path = 'test/杨幂' + str(name) + ".jpg"
        with open(path, 'wb+') as file:
            file.write(req.content)
    finally:
        # Free the slot so the main loop can start the next worker.
        thread_lock.release()

def main(label):
    """Crawl every image URL for keyword *label* and download them concurrently."""
    pages = pages_usr(label)
    pic_urls = pic_urls_find_part(pages)
    for count, pic_url in enumerate(pic_urls, start=1):
        print("Now Downloading: {}".format(count))
        # Block until a semaphore slot is free, then hand it to the worker
        # thread, which releases it when the download finishes.
        thread_lock.acquire()
        worker = threading.Thread(target=download_pic, args=(pic_url, count))
        worker.start()

main('杨幂')
Now Downloading: 1
Now Downloading: 2
Now Downloading: 3
Now Downloading: 4
Now Downloading: 5
Now Downloading: 6
Now Downloading: 7
Now Downloading: 8
Now Downloading: 9
Now Downloading: 10

参考:

  1. 从零起步 系统入门Python爬虫工程师