Coolapk (酷安) App Store Crawler

A classmate needed the download links of the apps on the Coolapk app store for some analysis, so I helped out. After a quick look, the site has no anti-crawling measures and can be crawled normally; unlike the Xiaomi app store, though, it requires parsing the HTML content.

0x01 Technical Points

  • Producer-consumer model to speed up the two-stage crawl
  • Use of re and XPath for parsing
  • The extracted download links cannot be used directly; it turns out the download request has to carry cookies (see the sketch after this list)
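
On that last point, here is a minimal sketch of downloading an APK with a cookie-carrying session. It assumes that the cookies set while browsing the detail page are what the download endpoint checks; the function download_apk and its arguments are illustrative names and are not part of the crawler below.

# Sketch: fetch an APK while sending cookies (assumption: cookies obtained from
# visiting the detail page are sufficient for the download endpoint).
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

def download_apk(detail_url, download_url, out_path):
    with requests.Session() as s:                # the session stores cookies automatically
        s.headers.update(HEADERS)
        s.get(detail_url)                        # visit the detail page first to pick up cookies
        resp = s.get(download_url, stream=True)  # the stored cookies are sent with this request
        resp.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)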

0x02 Usage

Coolapk's apps fall into two categories, apk and game. When instantiating the class, pass one of the two along with the number of pages to crawl.
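
For example, with the KuAn class defined in the next section, crawling the first ten pages of the game category looks like this (the page count is just an example):

KuAn(type='game', page=10).run()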

0x03 Source Code

# -*- coding: UTF-8 -*-
__author__ = 'Joynice'

import csv
import queue
import re
import threading
from copy import deepcopy

import requests
from lxml import etree


class KuAn(object):
    """Coolapk crawler: list pages -> detail pages -> download links, saved to CSV."""

    def __init__(self, type, page):
        if type not in ['apk', 'game']:
            raise ValueError('type must be "apk" or "game"')
        self.type = type
        self.page = page
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        self.csv_header = ['App Name', 'Download Link']
        # Write the CSV header once; rows are appended later by the saver threads.
        with open('{}.csv'.format(self.type), 'a+', newline='', encoding='utf-8-sig') as f:
            csv_file = csv.writer(f)
            csv_file.writerow(self.csv_header)
        self.url = 'https://www.coolapk.com'
        self.base_url = 'https://www.coolapk.com/{}'.format(type)
        # Three queues connect the producer/consumer stages:
        # list pages -> detail URLs -> parsed records -> CSV.
        self.page_url_queue = queue.Queue()
        self.detail_url_queue = queue.Queue()
        self.save_queue = queue.Queue()

    def get_detail_url_fun(self):
        """Consume list-page URLs and produce app detail-page URLs."""
        while True:
            page_url = self.page_url_queue.get()
            req = requests.get(url=page_url, headers=self.header)
            if req.status_code == 200:
                req.encoding = req.apparent_encoding
                html = etree.HTML(req.text)
                path = html.xpath('//*[@class="app_left_list"]/a/@href')
                for _ in path:
                    detail_url = self.url + _
                    print('Fetching detail link:', detail_url)
                    self.detail_url_queue.put(deepcopy(detail_url))
            self.page_url_queue.task_done()

    def get_download_url_fun(self):
        """Consume detail-page URLs and produce (name, download link) records."""
        while True:
            detail_url = self.detail_url_queue.get()
            req = requests.get(url=detail_url, headers=self.header)
            if req.status_code == 200:
                req.encoding = 'utf-8'
                url_reg = '"(.*?)&from=click'
                name_reg = '<p class="detail_app_title">(.*?)<'
                urls = re.findall(url_reg, req.text)
                names = re.findall(name_reg, req.text)
                if urls and names:  # skip pages where the patterns do not match
                    data = {'name': names[0], 'url': urls[0]}
                    print('Got data:', data)
                    self.save_queue.put(data)
            self.detail_url_queue.task_done()

    def save_data_fun(self):
        """Consume parsed records and append them to the CSV file."""
        while True:
            data = self.save_queue.get()
            name = data.get('name')
            url = data.get('url')
            with open('{}.csv'.format(self.type), 'a+', newline='', encoding='utf-8-sig') as f:
                csv_file = csv.writer(f)
                csv_file.writerow([name, url])
            self.save_queue.task_done()

    def run(self):
        # Seed the first queue with the list-page URLs.
        for _ in range(1, self.page + 1):
            page_url = self.base_url + '?p={}'.format(_)
            print('Dispatching page URL:', page_url)
            self.page_url_queue.put(page_url)

        thread_list = []
        for _ in range(2):
            get_detail_url = threading.Thread(target=self.get_detail_url_fun)
            thread_list.append(get_detail_url)

        for _ in range(5):
            get_download_url = threading.Thread(target=self.get_download_url_fun)
            thread_list.append(get_download_url)

        for _ in range(2):
            save_data = threading.Thread(target=self.save_data_fun)
            thread_list.append(save_data)

        for t in thread_list:
            t.daemon = True  # daemon threads exit with the main thread
            t.start()

        # Block until every queue has been drained and all tasks acknowledged.
        for q in [self.page_url_queue, self.detail_url_queue, self.save_queue]:
            q.join()

        print('Crawl finished.')


if __name__ == '__main__':
    KuAn(type='apk', page=302).run()