A while back, a classmate needed to scrape the download links of every app in the Coolapk app store for some analysis, so I helped out. A quick look showed the site has no anti-crawling measures, so it can be scraped normally. Unlike the Xiaomi app store, though, this site requires parsing the HTML content.
0x01 Key Techniques
- A producer-consumer model to speed up the two-stage (list page → detail page) crawl
- `re` and XPath for parsing
- The download links extracted from the page could not be used on their own; it turned out the download request has to carry cookies (see the sketch after this list)
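A minimal sketch of that cookie behavior, not part of the original crawler: visit a detail page with a `requests.Session` so the response cookies are stored, then reuse the same session for the APK download. The URLs below are placeholders, and the assumption is that the download host validates cookies set while browsing coolapk.com.

```python
import requests

# Shared session: cookies collected from earlier responses are sent
# automatically on later requests made through the same session.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0'

# Hypothetical detail page; visiting it lets the session pick up cookies.
session.get('https://www.coolapk.com/apk/com.example.app')

# Hypothetical download link, e.g. one row from the CSV this crawler writes.
download_url = 'https://dl.example.com/com.example.app.apk'
resp = session.get(download_url)  # the stored cookies ride along here
with open('com.example.app.apk', 'wb') as f:
    f.write(resp.content)
```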
0x02 Usage
Coolapk apps fall into two categories, apk and game. When instantiating the class, pass one of the two as type along with the number of pages to crawl, as shown below.
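For example, crawling the first 10 pages of the game category looks like this (the module name and page count are assumptions for illustration):

```python
from kuan import KuAn  # assuming the code below is saved as kuan.py

KuAn(type='game', page=10).run()  # results land in game.csv
```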
0x03 Source Code
```python
# -*- coding: UTF-8 -*-
__author__ = 'Joynice'
import requests
import queue
import threading
import re
from lxml import etree
import csv


class KuAn(object):

    def __init__(self, type, page):
        # Validate the category before creating any output file.
        if type not in ['apk', 'game']:
            raise ValueError('type must be "apk" or "game"')
        self.type = type
        self.page = page
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.csv_header = ['App Name', 'Download URL']
        # Write the CSV header once, before any worker starts appending rows.
        with open('{}.csv'.format(self.type), 'a+', newline='', encoding='utf-8-sig') as f:
            csv_file = csv.writer(f)
            csv_file.writerow(self.csv_header)
        self.url = 'https://www.coolapk.com'
        self.base_url = 'https://www.coolapk.com/{}'.format(type)
        # Three queues connect the producer-consumer stages:
        # list pages -> detail pages -> rows to save.
        self.page_url_queue = queue.Queue()
        self.detail_url_queue = queue.Queue()
        self.save_queue = queue.Queue()

    def get_detail_url_fun(self):
        # Consume list-page URLs, produce detail-page URLs.
        while True:
            page_url = self.page_url_queue.get()
            req = requests.get(url=page_url, headers=self.header)
            if req.status_code == 200:
                req.encoding = req.apparent_encoding
                html = etree.HTML(req.text)
                path = html.xpath('//*[@class="app_left_list"]/a/@href')
                for href in path:
                    detail_url = self.url + href
                    print('Fetching detail URL:', detail_url)
                    self.detail_url_queue.put(detail_url)
            # Mark the task done even on a bad status, or join() never returns.
            self.page_url_queue.task_done()

    def get_download_url_fun(self):
        # Consume detail-page URLs, produce {name, url} records.
        while True:
            detail_url = self.detail_url_queue.get()
            req = requests.get(url=detail_url, headers=self.header)
            if req.status_code == 200:
                req.encoding = 'utf-8'
                url_reg = '"(.*?)&from=click'
                name_reg = '<p class="detail_app_title">(.*?)<'
                download_url = re.findall(url_reg, req.text)
                name = re.findall(name_reg, req.text)
                # Skip pages where either pattern found no match.
                if download_url and name:
                    data = {'name': name[0], 'url': download_url[0]}
                    print('Got data:', data)
                    self.save_queue.put(data)
            self.detail_url_queue.task_done()

    def save_data_fun(self):
        # Consume records and append them to the CSV file.
        while True:
            data = self.save_queue.get()
            name = data.get('name')
            url = data.get('url')
            with open('{}.csv'.format(self.type), 'a+', newline='', encoding='utf-8-sig') as f:
                csv_file = csv.writer(f)
                csv_file.writerow([name, url])
            self.save_queue.task_done()

    def run(self):
        # Seed the first queue with every list-page URL.
        for i in range(1, self.page + 1):
            page_url = self.base_url + '?p={}'.format(i)
            print('Queueing page URL:', page_url)
            self.page_url_queue.put(page_url)

        thread_list = []
        for _ in range(2):
            thread_list.append(threading.Thread(target=self.get_detail_url_fun))
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.get_download_url_fun))
        for _ in range(2):
            thread_list.append(threading.Thread(target=self.save_data_fun))

        for t in thread_list:
            t.daemon = True  # daemon workers exit when the main thread does
            t.start()

        # Block until every queue has been fully drained and processed.
        for q in [self.page_url_queue, self.detail_url_queue, self.save_queue]:
            q.join()

        print('Crawl finished.')


if __name__ == '__main__':
    KuAn(type='apk', page=302).run()
```