Featured image of post 爬虫学习笔记3-api接口抓取

爬虫学习笔记3-api接口抓取

API

API(Application Programming Interface,应用程序编程接口)是一组规定和协议,它定义了不同软件应用或组件之间如何相互沟通和交互的方法。API可以视为一个中间件,它允许开发者访问和使用某些功能或数据,而无需了解背后的详细实现。

怎样使用

有些网站的会有对外的api接口供开发者调用,还有一些需要自己寻找

实操

图片网站 为例

获取请求地址

  1. 打开网站,可以看到有很多分类

  2. 选择一个分类,F12 打开控制台(开发者调用工具),点进Network,点击清空,点击刷新,查看请求的Url信息

获取返回数据

  1. 在请求的同时,会返回数据,也就是Response,点击Response,可以看到返回了一个Json

  2. 这个Json不太好看,需要使用Json格式化工具,我这里使用的是爬虫工具库,把Json粘贴进去,可以看到已经格式化了

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
	{
    "errno": 0,
    "msg": "",
    "data": {
        "total_count": 1996,
        "total_page": 200,
        "pageno": 0,
        "count": 10,
        "list": [
            {
                "id": "303197",
                "author": "Mr. Lonely",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/Pjribwpv4Uk0D6DIBxDsKiaj6oyaP8abYyIdyIX8cgWNZk9fgxss8dicJNRhic1ZE2ecODbicGgaZyAnjW0oxopvCpDD3Qmpu92KiaYaTicjdqMmus/132",
                "category": "4K专区",
                "tag": "城市夜景,灯光,古楼建筑,重庆洪崖洞",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/7dfab70819f358b0190ac5934f94acae.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303196",
                "author": "心若向阳",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/PiajxSqBRaEIoyZa5yQu5Uu0BXLWYTY5DeqWbQeHuibY0n1ibJQa17WwNfe1K8iblibRvMH9EEP9XgplkIYia9ZRLux0cmlLyCG8pSS30cUFpjFoCuyC2twt7VtA/132",
                "category": "4K专区",
                "tag": "卡通,天桥上的女孩,蓝天白云",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/993b932cb6d2df967c5922b35c117ddb.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303195",
                "author": "花儿",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/2DyBmGYZYz2nascllYH3tY1vXX3GibbXbmmkWvJjqK8sLiasTqFnqOQfaY504yqOg0GNX8QuXgsNxniaEhjqPEqIg/132",
                "category": "4K专区",
                "tag": "美食,面包",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/11c1e86af0e62f15513a1f85890f2641.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303194",
                "author": "旧城以西",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/Ux87Wj9W62K2aHMz1yia59ktccL4FTMXXIYu5viafFVCbaTt9hdQDveicG6LDg6H0LuO73VI86QueFQ2XXqPib1hLFSEeuKYRAr37Y6q48xNCLE/132",
                "category": "4K专区",
                "tag": "粉色鲜花",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/c10f78be2f780f7a5af5974dded2aaa4.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303193",
                "author": "xxx",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/zYicacMx5ibjo9RHN9u2sPFaCFyxSR7lnaNicqFJ1zgvMKvpveKluvDicmRtng8Qy7NmQPib2sREuGRXBHvuxIiaE1n6mH9hSxKuXGth7gNCZepOk/132",
                "category": "4K专区",
                "tag": "手绘绿色植物,梦幻",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/c6d954d05d64ccc1b92903f9e1dbeedf.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303192",
                "author": "绝版自我",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/V6RyaF0FIPmrBFZCLoGq7pkIZnSqJUr1YGC6aEjstLPMf87bhQ93do4Ba8fWE5aZYJXRfa4wPVZyz7sibiaYxlpMzEdt58sWticDsD2Q6MZs9s/132",
                "category": "4K专区",
                "tag": "美女和白猫",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/44641df1201422292333814a59f2828e.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303191",
                "author": "不发脾气",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/lLjNEmtiacOgicrFeJATGgAjL50EpBYV4hE8BbQUq4qlicERibkSPicPYSaOK44r1Pvgwz0icaW2xGg37wPfDabBwiaicicqXPhOPgc73gQpYTXCdkWs/132",
                "category": "4K专区",
                "tag": "短发美女",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/97cc9610292be10724ee7d78f8f50c80.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303190",
                "author": "ono",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/EKagrrV6YQV5MOfGKAeTM9piaKicvrFu6MbmoB1rS3mwIFjcQ6zaQtzVVBj3aynQoDtDU7ZQHWGogg6PRtdicFdWiaYSIvEnO5b7I5A2UN3etYE/132",
                "category": "4K专区",
                "tag": "时尚美女,写真",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/157600b64798a86c3d4180313832ace0.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303189",
                "author": "张一一",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/PiajxSqBRaEIiaarFR3aPOSj1d65tcQqlHCzJMdPIreZeRXkNnoGianicPDXRRuQSjFUwmv4V7TVMl4YyJBUYUiapvgk85IgwaibcjF5rGcyYqXrTVtUhmnZF35A/132",
                "category": "4K专区",
                "tag": "清纯女孩",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/5bdf2ee80f7cbef6220ad9acb3d888e9.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            },
            {
                "id": "303188",
                "author": "晓燕",
                "user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/PiajxSqBRaELicjIBesHecRucVeNmDZMFOgcbdFpFn6YBsak7jV7cOhBDbpvibKsFhKxJ6iaqYdSAVmqtG3HeaZBadHQs8TeC0I9BMj8agffoPHTZEcVys2mHQ/132",
                "category": "4K专区",
                "tag": "孙俪,街拍",
                "url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/52336cbbd87555c03b457ed4e4ff4e85.jpg",
                "status": "1",
                "live_open": false,
                "class_id": "36"
            }
        ]
    },
    "processTime": 0.012
	}
	```


### 分析请求地址

1. 可以看到,我选择4K专区,请求的地址是https://www.nuantang.net/api.php?cid=36&start=0&count=10
2. 根据参数猜测
```text
cid=36.   可能是4K专区的分类id
start=0.   从第一页开始获取
count=10.   每次请求数量是10

分析返回数据

  1. 因为返回的数据是10组,所以只分析一组即可
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
{
"errno": 0,
"msg": "",
"data": {
	"total_count": 1996,
	"total_page": 200,
	"pageno": 0,
	"count": 10,
	"list": [
		{
			"id": "303197",
			"author": "Mr. Lonely",
			"user_image": "https://thirdwx.qlogo.cn/mmopen/vi_32/Pjribwpv4Uk0D6DIBxDsKiaj6oyaP8abYyIdyIX8cgWNZk9fgxss8dicJNRhic1ZE2ecODbicGgaZyAnjW0oxopvCpDD3Qmpu92KiaYaTicjdqMmus/132",
			"category": "4K专区",
			"tag": "城市夜景,灯光,古楼建筑,重庆洪崖洞",
			"url": "http://cdn-hsyq-static-bak.shanhutech.cn/bizhi/staticwp/201704/7dfab70819f358b0190ac5934f94acae.jpg",
			"status": "1",
			"live_open": false,
			"class_id": "36"
		}
	]
},
"processTime": 0.012
}
	```

2. 根据返回参数猜测
```text
公共数据:
"total_count": 1996    可能是4K专区的总数有1996张图片
"total_page": 200      总页数有200
"pageno": 0            当前页数
"count": 10            当前页图片数量


图片数据:
"id": ""                   图片id
"author": ""               图片作者或者上传者
"user_image": ""           可能是作者头像
"category": ""             图片分类
"tag": ""                  图片标签
"url": ""                  图片链接
"status": ""               图片状态,目前不知什么用
"live_open":               不知道什么意思
"class_id": ""             应该是4K专区的分类id,和上面猜测的一致

代码测试

一次获取所有数据(不推荐)

1
2
3
4
5
6
7
8
import requests  
  
  
url = "https://www.nuantang.net/api.php?cid=36&start=0&count=2000"  
  
resp = requests.get(url)  
  
print(resp.text)

根据测试,只要改变参数 count 的值,就可以直接把该分类下的所有图片获取到,但是一次获取大量数据,可能会被禁止访问,所以不推荐

分多次获取数据(推荐)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
from time import sleep  
  
import requests  

pages = 20  # 分20页  
count = 100  # 每次请求100个  
  
for i in range(pages + 1):  
    url = f"https://www.nuantang.net/api.php?cid=36&start={i}&count={count}"  
  
    resp = requests.get(url)  
  
    print(resp.text)  
  
    print("=" * 20)  
  
    sleep(1)  # 暂停1秒

根据测试,每页请求100张图片,可以分20次返回,数据也很全面,并且每次暂停一秒,也不会因为过快请求被禁止访问

完整版代码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os  
from time import sleep  
from PIL import Image  
from io import BytesIO  
  
import requests  
  
  
def get_images():  
      
    pages = 20  # 分20页  
    count = 100  # 每次请求100个  
  
    for i in range(pages + 1):  
        url = f"https://www.nuantang.net/api.php?cid=36&start={i}&count={count}"  
        resp = requests.get(url)  
  
        img_datas = resp.json()  
  
        for img_data in img_datas["data"]["list"]:  
  
            img_url, save_path = get_img_info(img_data)  
  
            download_image(img_url, save_path)  
  
            sleep(1)  
  
        sleep(1)  # 暂停1秒  
  
  
def get_img_info(img_data):  
    img_url = img_data["url"]  
  
    category = img_data["category"]  
  
    tag = img_data["tag"]  
  
    img_name = tag.replace(",", "-")  
  
    # 判断目录是否存在  
    save_dir = f"./images/{category}"  
    if not os.path.exists(save_dir):  
        os.makedirs(save_dir)  
  
    save_path = f"./images/{category}/{img_name}"  
  
    return img_url, save_path  
  
  
def download_image(img_url, save_path):  
    try:  
  
        img_resp = requests.get(img_url)  
        # 获取图片文件格式  
        image = Image.open(BytesIO(img_resp.content))  
  
        img_save_path= f"{save_path}.{image.format}"  
  
        with open(img_save_path, 'wb') as f:  
            f.write(img_resp.content)  
  
    except Exception as e:  
        print(f"下载失败: {e}")  
  
  
if __name__ == '__main__':  
    get_images()

使用 Hugo 构建
主题 StackJimmy 设计