程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您现在的位置: 程式師世界 >> 編程語言 >  >> 更多編程語言 >> Python

Python - Demo crawlers for various brand libraries

編輯:Python

Solution

Source code: https://gitee.com/shentuzhigang/mini-project/tree/master/brand-crawler

Vipshop

import json
import openpyxl
import requests
# Vipshop: fetch the category tabs, then for every tab pull its sections and
# collect the brands listed under the ' Select brands ' category. The result
# is written to vip.xlsx (sheet 'vip'), one row per brand.
allBrandList = []
r = requests.get(
    'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360552&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=wifi&width=414&height=622&hierarchy_id=107&category_id=&category_filter=&sale_for=&client_from=wxsmall')
json1 = r.json()
data1 = json1['data']['data']['tabs']
for tab in data1:
    print(tab['categoryId'])
    r2 = requests.get(
        'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab_data/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360675&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=WIFI&width=750&height=500&pcmpWidth=510&hierarchy_id=107&category_id=' +
        tab['categoryId'] + '&sale_for=')
    json2 = r2.json()
    data2 = json2['data']['data']
    for section in data2['sectionList']:
        # NOTE(review): the space-padded literal below looks machine-translated;
        # confirm the category name the API really returns before relying on it.
        if section['sectionType'] != 'category' or section['category']['name'] != ' Select brands ':
            continue
        for brand in section['category']['children']:
            # Flatten one level: fields of any nested dict are merged into the
            # brand record itself (later keys overwrite earlier ones).
            B = dict(brand)
            for nested in brand.values():
                if isinstance(nested, dict):
                    B.update(nested)
            print(B)
            allBrandList.append(B)

f = openpyxl.Workbook()
sheet1 = f.create_sheet('vip')

# Assign each distinct key a 1-based column in first-seen order and write it
# into the header row.
keys = {}
for record in allBrandList:
    for key in record:
        if key not in keys:
            keys[key] = len(keys) + 1
            sheet1.cell(row=1, column=keys[key]).value = key

# Data rows start right below the header; every value is stored as text.
for row, record in enumerate(allBrandList, start=2):
    for key, value in record.items():
        sheet1.cell(row=row, column=keys[key]).value = str(value)
f.save('vip.xlsx')

Dewu

import json
import openpyxl
# Dewu: convert a previously saved API response (dewu.json) into dewu.xlsx.
# Each entry of data.list is flattened into one record and written as one row.
with open("dewu.json", 'r', encoding='utf-8') as load_f:
    load_dict = json.load(load_f)
series = load_dict['data']['list']

allBrandList = []
for entry in series:
    # Assumes every value of an entry is itself a dict (TODO confirm against
    # the captured payload); their fields are merged into one flat record.
    dc = {}
    for part in entry.values():
        dc.update(part)
    print(dc)
    allBrandList.append(dc)

# Map each distinct key to a 1-based column, in first-seen order.
keys = {}
for record in allBrandList:
    for key in record:
        keys.setdefault(key, len(keys) + 1)

f = openpyxl.Workbook()
sheet1 = f.create_sheet('dewu')

# Header row: the column titles are the record keys.
for key, column in keys.items():
    sheet1.cell(row=1, column=column).value = key

# Data rows start at row 2 so they do not overwrite the header.
for row, record in enumerate(allBrandList, start=2):
    for key, value in record.items():
        sheet1.cell(row=row, column=keys[key]).value = str(value)
f.save('dewu.xlsx')

Chanmama - TikTok

Platform restriction: only the first 10,000 entries can be accessed.
Python2

# coding=utf-8
import json
import urllib2
# Chanmama brand search (Python 2): download result pages 1..99, 100 entries
# per page, and dump the combined list to data.json.
listAll = []
for i in range(1,100):
    url = "https://api-service.chanmama.com/v2/home/brand/search?page="+ str(i) +"&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print(url)
    request = urllib2.Request(url)
    request.add_header("cookie", "***")
    # Imitate Firefox
    request.add_header("user-agent", "Mozilla/5.0")
    response = urllib2.urlopen(request)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    page = json.loads(content)
    listAll.extend(page['data']['list'])

# Open the output only once everything is downloaded, so a failed request
# does not leave behind a truncated, empty data.json.
with open('data.json', 'w') as f:
    f.write(json.dumps(listAll))

Python3

# coding=utf-8
import json
import requests
# Chanmama brand search (Python 3): download result pages 1..99, 100 entries
# per page, and dump the combined list to data.json.
listAll = []
for i in range(1, 100):
    url = "https://api-service.chanmama.com/v2/home/brand/search?page=" + str(
        i) + "&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
    print(url)
    response = requests.get(url, headers={
        "cookie": "***",
        # Imitate Firefox
        "user-agent": "Mozilla/5.0"
    })
    content = response.json()
    listAll.extend(content['data']['list'])

# Write the collected entries once all pages are fetched; the context manager
# closes the file. json.dumps emits ASCII-only text by default, so the
# platform's default file encoding does not matter here.
with open('data.json', 'w') as f:
    f.write(json.dumps(listAll))

Aikucun

Version 1

import requests
import json
# Aikucun, version 1: list the activity tags of one shop, query the brand list
# of every tag for status 1 and 2, and dump all brands to aikucun.json.
cookie = '***'
headers = {
    "cookie": cookie,
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    for status in range(1, 3):
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        # Responses without a 'data' member carry no brands for this query.
        if 'data' not in json1:
            continue
        # The payload nests brands two levels deep: a list of groups, each
        # holding its own 'brandList'.
        for brandList in json1['data']['brandList']:
            for b in brandList['brandList']:
                allBrandList.append(b)
                # Debug view only: brand fields merged with its 'brandExtend'.
                print(dict(b, **b['brandExtend']))

# json.dumps escapes non-ASCII characters by default, so the text written here
# is plain ASCII; the context manager guarantees the file is flushed and closed.
with open('aikucun.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(allBrandList))

Version 2
Save to xlsx
Fix garbled characters

import re
import requests
import openpyxl
# Aikucun, version 2: same crawl as version 1, but the brand name prefix is
# rebuilt from the hex-encoded 'pcodelen' field and the result is saved to
# aikucun.xlsx (sheet 'aikucun') with a header row.
cookie = ''
headers = {
    "cookie": cookie,
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    for status in range(1, 3):
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        # Responses without a 'data' member carry no brands for this query.
        if 'data' not in json1:
            continue
        for brandList in json1['data']['brandList']:
            for b in brandList['brandList']:
                print(dict(b, **b['brandExtend']))
                if 'pcodelen' in b and b['pcodelen'] != '':
                    # 'pcodelen' is treated as consecutive groups of four hex
                    # digits, each one a \uXXXX code point.
                    # SECURITY: the groups are decoded with int()/chr() instead
                    # of eval() on a constructed string literal, because the
                    # value comes from the remote API and must never be
                    # executed as code.
                    str1 = ''.join(
                        chr(int(unit, 16))
                        for unit in re.findall(r'.{4}', str(b['pcodelen'])))
                    # Replace the first len(str1) characters of the brand name
                    # with the decoded prefix.
                    b['pinpaiming0'] = str1 + str(b['pinpaiming'])[len(str1):]
                    print(b['pinpaiming0'])
                allBrandList.append(b)
                print(sorted(dict(b, **b['brandExtend']).items(), key=lambda d: d[0]))

# Map each distinct key to a 1-based column, in first-seen order.
keys = {}
for record in allBrandList:
    for key in record:
        keys.setdefault(key, len(keys) + 1)

f = openpyxl.Workbook()
sheet1 = f.create_sheet('aikucun')

# Header row: the column titles are the record keys.
for key, column in keys.items():
    sheet1.cell(row=1, column=column).value = key

# Data rows start at row 2 so they do not overwrite the header.
for row, record in enumerate(allBrandList, start=2):
    for key, value in record.items():
        sheet1.cell(row=row, column=keys[key]).value = str(value)
f.save('aikucun.xlsx')

Good Wardrobe (webuy.ai)

import requests
import json
# Webuy: probe category ids -300..599 for exhibition park types 1 and 5,
# collect every brand entry returned, and dump the list to webuy.json.
headers = {
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
allBrandList = []
for i in range(-300, 600):
    for ty in [1, 5]:
        response = requests.post('https://www.webuy.ai/sesame/hyk/shopCategory/brand/detail',
                                 headers=headers,
                                 json={
                                     "exhibitionParkType": ty,
                                     "categoryId": i,
                                     "shopId": 3572,
                                     "pageSize": 1000,
                                     "pageNo": 1,
                                     "isPageQuery": False
                                 })
        # Decode the body once and reuse it for both logging and processing.
        json1 = response.json()
        print(json1)
        for b in json1['entry']:
            print(b)
            allBrandList.append(b)

# json.dumps escapes non-ASCII characters by default, so the text written here
# is plain ASCII; the context manager guarantees the file is flushed and closed.
with open('webuy.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(allBrandList))

Fashion Brand Network (chinasspp.com)

import requests
from bs4 import BeautifulSoup
import openpyxl
from openpyxl.drawing.image import Image
from PIL import Image as PILImage
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import threading
import time
# Output workbook for the chinasspp crawl: one sheet whose first row holds the
# column titles listed in `headers`.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('chinasspp')
headers = [' The brand name ', ' Industry categories ', ' Corporate name ', ' contact number ', ' Company fax ', ' Official website ', ' Contact address ', ' Online Service ']
for column, title in enumerate(headers, start=1):
    header_cell = sheet1.cell(row=1, column=column)
    header_cell.value = title
# Row counter for detail pages; row 1 is taken by the header.
count = 1
def parseDetail(no, link):
    """Scrape one brand detail page into row *no* of the module-level sheet.

    Every ``<li>`` under ``#brand_info_ctl00_blink`` is read as
    ``label:value``. The value goes into the column whose title in the
    module-level ``headers`` list equals the label. For the phone and fax
    fields the item's ``<img>`` is downloaded and anchored on the same cell
    as well.

    Args:
        no: 1-based worksheet row to fill.
        link: URL of the brand detail page.
    """
    response = requests.get(link)
    # The page is decoded as GBK rather than whatever requests would guess.
    response.encoding = "gbk"
    soup = BeautifulSoup(response.text, 'lxml')
    print('no' + str(no))
    for item in soup.select_one("#brand_info_ctl00_blink").select('li'):
        # Split on the first colon only, so values that contain ':' themselves
        # (URLs, for example) are kept whole.
        key, _, value = item.text.partition(':')
        if key not in headers:
            # Unknown label: skip this item instead of aborting the whole row.
            continue
        column = headers.index(key) + 1
        sheet1.cell(row=no, column=column).value = value
        if key in (' contact number ', ' Company fax '):
            img = item.select_one('img')
            src = img.attrs.get('src') if img is not None else None
            if not src:
                # No picture on this item; the text value is all there is.
                continue
            picture_url = 'http://www.chinasspp.com' + src
            picture = PILImage.open(BytesIO(requests.get(picture_url).content))
            # Anchor like "D7": column letter from the header position, then
            # the row number.
            sheet1.add_image(Image(picture), chr(ord("A") + column - 1) + str(no))
# Walk listing pages 1..515, queue one parseDetail job per brand on a
# 16-thread pool, then save the workbook once every job has finished.
pending = []
with ThreadPoolExecutor(max_workers=16) as pool:
    for i in range(1, 516):
        print('Page ' + str(i))
        response = requests.get("http://www.chinasspp.com/brand/%E5%A5%B3%E8%A3%85%E5%93%81%E7%89%8C/" + str(i) + "/")
        soup = BeautifulSoup(response.text, 'lxml')
        for brand in soup.select(".brand"):
            link = brand.select_one('.logo').attrs.get('href')
            # Each brand gets the next free worksheet row.
            count += 1
            pending.append((count, link, pool.submit(parseDetail, count, link)))
# Leaving the with-block has already waited for every submitted job.

# A Future keeps a worker's exception to itself until asked; report failures
# here so a broken detail page is visible instead of silently missing.
for row, link, future in pending:
    error = future.exception()
    if error is not None:
        print('row ' + str(row) + ' failed for ' + str(link) + ': ' + repr(error))
f.save('chinasspp.xlsx')

  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved