
[Python crawler] Crawling Vipshop product information

Steps for collecting Vipshop product information:

  1. Get the brand IDs and brand names;
  2. Get the total number of pages in the current brand's product list;
  3. Get the product information from each page of the product list.
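Before diving into the implementation, the three steps chain together like this (a minimal sketch; `fetch_brands`, `fetch_total`, and `fetch_page` are hypothetical stand-ins for the request methods shown in the sections below):

```python
def crawl(keyword, fetch_brands, fetch_total, fetch_page):
    """Chain the three steps: brands -> page count -> per-page products."""
    results = []
    for brand_id, brand_name in fetch_brands(keyword):      # step 1
        total, batch = fetch_total(keyword, brand_id)       # step 2
        for offset in range(0, total, batch):               # step 3
            results.extend(fetch_page(keyword, brand_id, offset))
    return results
```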

1. Get the brand IDs and brand names

# Module-level imports these methods depend on:
# import datetime, random, time
# from urllib.parse import urlencode

def get_task(self, task_list=None):
    '''
    Fetch the task list: for each product-type keyword, query the brand
    search API and store (keyword, brand_id, brand_name) rows.
    '''
    try:
        print("List of product types:", task_list)
        for task_ in task_list:
            key_dit = {
                "keyword": task_
            }
            url_str = urlencode(key_dit)
            start_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shop/search/brand_store/get/v3?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&channel_id=1&isAZSort=1&gPlatform=WAP&mvip=true&_=1599117093&{url_str}'''
            print(task_, " ", start_api)
            task_resp = self.sc.get_html(start_api)
            task_json = task_resp.json()
            if task_json['code'] == 1:
                brand_list = task_json['data']['list']
                # Renamed from task_list so the argument being
                # iterated over is not shadowed
                insert_list = []
                for brand_dict in brand_list:
                    # Brand ID
                    brand_id = int(brand_dict['id'])
                    # Brand name
                    brand_name = brand_dict['name']
                    # Insert time
                    add_time = datetime.datetime.now()
                    insert_list.append((task_, brand_id, brand_name, add_time, 1))
                if len(insert_list):
                    sql = f'''insert into {self.task_tbl}(goods_type,brand_id,brand_name,add_time,is_state)
                              values(%s,%s,%s,%s,%s)'''
                    print("Current tasks:", insert_list)
                    self.sc.store_data(sql, data_list=insert_list)
    except Exception:
        self.sc.collect_error()
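The response-parsing part of `get_task` can be isolated into a small pure function, which makes it easy to verify against a sample payload. The field names (`code`, `data.list`, `id`, `name`) are exactly those the method above reads; `parse_brands` itself is a hypothetical helper, not part of the original code:

```python
import datetime

def parse_brands(task_json, keyword):
    """Turn a brand_store/get/v3 response into rows for the task table.
    Each row is (goods_type, brand_id, brand_name, add_time, is_state)."""
    rows = []
    if task_json.get('code') == 1:
        for brand in task_json['data']['list']:
            rows.append((keyword, int(brand['id']), brand['name'],
                         datetime.datetime.now(), 1))
    return rows
```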

2. Get the total number of pages in the current brand's product list

def get_totalpage(self, id, brand_id, goods_type):
    '''
    Extract the total item count and page size (batchSize) for one brand,
    and record total_num in the task table.
    '''
    try:
        key_dit = {
            "keyword": goods_type
        }
        url_str = urlencode(key_dit)
        total_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shopping/search/product/rank?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&{url_str}&brandStoreSns={brand_id}&sort=0&pageOffset=0&channelId=1&wapConsumer=A1&gPlatform=WAP&functions=bsBrands%2CfavNumLabel%2CtotalLabel&mvip=true&_=1599122080'''
        print("Task ID:", id, " Product type:", goods_type, " Brand ID:", brand_id, " Fetching total pages:")
        print(total_api)
        html = self.sc.get_html(total_api)
        if not html:
            # The caller unpacks a tuple, so never return a bare 0
            return (0, 1)
        resp = html.json()
        total_num = int(resp['data']['total'])
        page_offset = int(resp['data']['batchSize'])
        up_sql = f'''update {self.task_tbl} set total_num={total_num} where id={id}'''
        print("Updating total items:", up_sql)
        self.sc.store_data(up_sql)
        return (total_num, page_offset)
    except Exception:
        self.sc.collect_error()
        return (0, 1)
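The `total` and `batchSize` values returned here drive the pagination in the next step: the crawler steps through offsets of `batchSize` until it passes `total`. The arithmetic is easy to check in isolation (`page_plan` is a hypothetical helper, named here only for illustration):

```python
def page_plan(total_num, batch_size):
    """Number of pages and the pageOffset values used in step 3.
    E.g. 25 items with batchSize=10 need 3 pages at offsets 0, 10, 20."""
    offsets = list(range(0, total_num, batch_size))
    return len(offsets), offsets
```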

3. Get the product information from each page of the product list

def get_products(self, data_tuple):
    '''Extract the product list for one task row.'''
    try:
        id, goods_type, brand_id, brand_name, total_num, add_time, is_state = data_tuple
        total_num, page_offset = self.get_totalpage(id, brand_id, goods_type)
        time.sleep(3.2)
        for i in range(0, total_num, page_offset):
            key_dit = {
                "keyword": goods_type
            }
            url_str = urlencode(key_dit)
            brand_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shopping/search/product/rank?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&{url_str}&brandStoreSns={brand_id}&sort=0&pageOffset={i}&channelId=1&wapConsumer=A1&gPlatform=WAP&functions=bsBrands%2CfavNumLabel%2CtotalLabel&mvip=true&_=1599122080'''
            print(f"Fetching products at offset {i} ", brand_api)
            html = self.sc.get_html(brand_api)
            if not html:
                print("Failed to fetch the product ID page")
                continue
            resp_json = html.json()
            products_list = resp_json['data']['products']
            if len(products_list):
                insert_list = []
                for products_dict in products_list:
                    # Goods ID
                    goods_id = int(products_dict['pid'])
                    # Insert time
                    add_time = datetime.datetime.now()
                    insert_list.append((goods_type, brand_id, brand_name, goods_id, add_time, 1))
                insert_sql = f'''insert ignore into {self.data_tbl}(goods_type,brand_id,brand_name,
                                 goods_id,add_time,is_state) values (%s,%s,%s,%s,%s,%s)'''
                print("Inserting data ", "*" * 50)
                self.sc.store_data(insert_sql, data_list=insert_list)
            time.sleep(random.uniform(1.7, 4.2))
        up_sql = f'''update {self.task_tbl} set is_state=0 where id={id}'''
        print(f"{brand_id} - {brand_name} finished crawling goods IDs", up_sql)
        self.sc.store_data(up_sql)
    except Exception:
        self.sc.collect_error()
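Note the `insert ignore` above: assuming the MySQL table has a unique index on `goods_id` (which `insert ignore` implies), re-crawled product IDs silently skip insertion instead of creating duplicates. The same idea can be demonstrated self-contained with stdlib sqlite3, where the equivalent syntax is `INSERT OR IGNORE` (the article itself writes to MySQL through `self.sc.store_data`):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE goods (
    goods_type TEXT, brand_id INTEGER, brand_name TEXT,
    goods_id INTEGER PRIMARY KEY, add_time TEXT, is_state INTEGER)""")

rows = [("shoes", 1, "BrandA", 100, "2020-09-03", 1),
        ("shoes", 1, "BrandA", 100, "2020-09-03", 1),  # duplicate goods_id
        ("shoes", 1, "BrandA", 101, "2020-09-03", 1)]
# Duplicate goods_id rows are skipped, mirroring MySQL's INSERT IGNORE
conn.executemany("INSERT OR IGNORE INTO goods VALUES (?,?,?,?,?,?)", rows)
count = conn.execute("SELECT COUNT(*) FROM goods").fetchone()[0]
print(count)  # 2 -- the duplicate goods_id was ignored
```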

That's all I have to share. If anything is lacking, please point it out; I'd be glad to discuss further. Thank you!

If you want more data or a custom crawler, please send me a private message.

