Scraping Shopify collections products with Python requests

A simple example of scraping Shopify product information with requests.

You need to work out the CSS selectors for the product link, title, price, description, and images yourself; they vary between Shopify themes.
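
Before running the full script below, it helps to sanity-check each selector interactively. A minimal sketch, using the same collection URL and product-link selector as the script (swap in your own target):

import bs4
import requests

# fetch one collection page and count how many product links the selector matches
res = requests.get('https://www.naluboutique.com/collections/all', timeout=30)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
links = soup.select('.grid-product__content .grid-product__link')
print(len(links), 'product links found')
print(links[0].get('href') if links else 'selector matched nothing')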

import csv
import os
import re
import time

import bs4
import requests

# collections URL
url_pro = 'https://www.naluboutique.com/collections/all'

# CSS selectors for each field, worked out from the page source
product_link_select = '.grid-product__content .grid-product__link'
product_title_select = '.product-single__title'
product_price_select = '.product__price'
product_desc_select = '.product-single__description'
product_img_select = '.product__photos img'

'''Crawl the collection listing pages'''


def crawl_collections(url):
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Requested', url, 'status', res.status_code)
            # raise an exception if the status code is not 200
            res.raise_for_status()
            break
        except requests.RequestException:
            timeout = 3
            print('Connection failed, retrying in', timeout, 'seconds')
            time.sleep(timeout)
            print('')
            print('Reconnecting')
    print('Request succeeded, collecting product links')
    # html.parser picks the built-in parser
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    links = noStarchSoup.select(product_link_select)
    for link in links:
        product_url = domainWithProtocol + link.get('href')
        print('Got product url', product_url)
        # scrape the product page itself
        crawl_product(product_url)
        print('\n')


'''End of product-URL collection'''

'''Crawl the product pages'''


def crawl_product(url):
    print('Requesting product page', url)
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Product page fetched, status:', res.status_code)
            res.raise_for_status()  # raise an exception if the download failed
            break
        except requests.RequestException:
            print('Request for product page', url, 'failed, retrying')
            time.sleep(3)  # wait a moment before reconnecting

    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    name = noStarchSoup.select(product_title_select)[0].getText()
    price = noStarchSoup.select(product_price_select)[0].getText()
    price = re.sub(r'\s', '', price)  # strip spaces and newlines around the price
    des = noStarchSoup.select(product_desc_select)[0].getText()
    img_tags = noStarchSoup.select(product_img_select)
    images = []
    for tag in img_tags:
        imgurl = tag.get('src')
        if imgurl is None:
            imgurl = tag.get('data-src')
        if imgurl is None or '{width}' in imgurl:
            continue  # skip missing sources and Shopify's responsive-image URL templates
        images.append('https:' + imgurl)
    images = '\r\n'.join(images)
    fileHeader = ['Title', 'Product URL', 'Price', 'Description', 'Images']
    row = [name, url, price, des, images]
    # append to the CSV as utf-8; the target folder must already exist
    while True:
        try:
            csvFile = open(csv_name, 'a+', encoding='utf-8', newline='')
            break
        except Exception as e:
            print(e)
            print(csv_name + ' could not be opened, retrying...')
            time.sleep(5)
    size = os.path.getsize(csv_name)  # size 0 means the header row has not been written yet
    writer = csv.writer(csvFile)
    if size == 0:
        writer.writerow(fileHeader)
    writer.writerow(row)
    csvFile.close()
    print('Product scraped!')


if __name__ == '__main__':
    protocol = 'https://'
    domain = re.match('https://(.*)/collections', url_pro).group(1)
    domainWithProtocol = protocol + domain
    csv_name = domain + time.strftime('_%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.csv'
    next_link = ['1']  # non-empty placeholder so the loop runs at least once
    n = 1
    while next_link != []:
        url = url_pro + '?sort_by=best-selling&page=' + str(n)
        # scrape one page of the collection listing
        crawl_collections(url)
        print('Finished scraping page', n)
        n = n + 1
        # stop when the page just crawled no longer has a "next" pagination link
        res = requests.get(url)
        res.raise_for_status()
        noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
        next_link = noStarchSoup.select('.next')
    print('All pages scraped!')
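
As an aside, most Shopify stores also expose a public JSON endpoint at /products.json, which sidesteps HTML parsing and selector maintenance entirely. A minimal sketch, assuming the target store has the standard endpoint enabled (field names follow Shopify's product JSON):

import requests

base = 'https://www.naluboutique.com'
page = 1
while True:
    # Shopify paginates this endpoint with limit (max 250) and page parameters
    res = requests.get(base + '/products.json', params={'limit': 250, 'page': page}, timeout=30)
    res.raise_for_status()
    products = res.json().get('products', [])
    if not products:
        break  # an empty list means we have paged past the last product
    for p in products:
        print(p['title'], base + '/products/' + p['handle'])
    page += 1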

Result:

[image: image-20220801152718166]

