程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您現在的位置: 程式師世界 >> 編程語言 >> 更多編程語言 >> Python

Python crawler series: Vipshop product data collection

編輯:Python

Python crawler series: Vipshop product data collection

If you have any questions, click here to contact us

Please scan the QR code below for wechat

The code is for learning and communication only. Do not use it for illegal purposes.

Go straight to the code

import requests
from queue import Queue
import configparser
import json
import sys
import execjs
import xlrd
import xlwt
import os
import re
import redis
from xlutils.copy import copy
import random
import threading
import time
from sign import getHeaders, getHeaders_
import traceback
from RedisUtils import RedisUtils
from urllib import parse
import urllib.parse
# Number of attempts made for each HTTP request before giving up.
retry = 3
# Timeout handed to requests.get() for every request.
timeout = 20
# Key pattern passed to RedisUtils.getLikeKeys() to look up proxy entries.
kw = "vipproxy"
r = RedisUtils()
# Header row written to every output workbook, one entry per column.
excelTitle = [" Commodity title ", " Product id ", " Applicable season ", " Applicable gender ", " style ", " Fabric ", " Size ", " Crossed price ", " Present price ", " Post coupon price "]
# Output directory for the generated .xls files, below the working directory.
excelPath = os.getcwd() + "/data/"
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(excelPath, exist_ok=True)
cf = configparser.ConfigParser()
try:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so a missing conf.ini has to be
    # detected from the return value; it never raises for that case.
    loadedFiles = cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except (configparser.Error, UnicodeError) as e:
    # The file exists but is malformed or not decodable.
    print(e)
    print(" Program directory does not exist conf.ini The configuration file ~")
    sys.exit(0)
if not loadedFiles:
    print(" Program directory does not exist conf.ini The configuration file ~")
    sys.exit(0)
def getConf(sec, key):
    """Return the value of option *key* in section *sec* of the loaded config.

    When the option cannot be read, the parser error and a notice naming the
    setting are printed and the process terminates through ``sys.exit(0)``,
    so callers never receive a missing value.
    """
    try:
        return cf.get(sec, key)
    except configparser.Error as e:
        print(e)
        # str() keeps the notice printable even for non-string arguments.
        print(" The following configuration is not available :" + str(sec) + " - " + str(key))
        sys.exit(0)
# Worker-thread count read from conf.ini ([app-sys] threadNums). Values that
# are not a positive integer fall back to a single thread. A missing option
# never reaches the except clause: getConf() exits the process itself.
try:
    threadNums = max(1, int(getConf("app-sys", "threadNums")))
except (TypeError, ValueError):
    threadNums = 1
def getCurrentTime():
    """Return the current local time formatted as ``YYYY_MM_DD_HH_MM_SS``."""
    # strftime() formats the current local time when no time tuple is given
    # and already returns a str, so no extra conversion is needed.
    return time.strftime('%Y_%m_%d_%H_%M_%S')
def getHtml(url, headers, proxies):
    """Fetch *url* with an HTTP GET and return the body decoded as UTF-8.

    The request is attempted up to ``retry`` times with the given *headers*
    and *proxies*. Returns ``None`` when every attempt fails; callers treat
    that as a failed fetch.
    """
    for attempt in range(retry):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            return resp.content.decode("utf-8")
        except Exception as e:
            # Deliberately broad: this is a best-effort fetch and callers do
            # not guard the call, but the failure is reported rather than
            # swallowed silently.
            print(" Request failed (attempt %d/%d) : %s" % (attempt + 1, retry, e))
    return None
def getProxy():
    """Block until a proxy entry is available in Redis and return it.

    A random key matching ``kw`` is picked, its value is read and the key is
    removed from Redis. The value is returned as a ``proxies`` mapping for
    ``requests``, registered for both schemes: every URL requested in this
    file is https, and requests selects the proxy by URL scheme, so an
    ``"http"``-only mapping would never be applied.
    """
    while True:
        keys = r.getLikeKeys(kw)
        if keys and isinstance(keys, list):
            key = random.choice(keys)
            address = r.get(key)
            r.rm(key)
            if address:
                return {"http": address, "https": address}
            # The key had no value (e.g. another worker consumed it between
            # the listing and the read); pick another one.
            continue
        print("ip The pool has no ip, Please check the proxy platform account ip Whether the margin is sufficient , or ip pool .exe / redis-server Whether to start !")
        # Pause before polling again instead of spinning on Redis.
        time.sleep(1)
def getCookies():
    """Block until a non-empty ``vipcookie`` value can be read from Redis.

    Redis is polled once per second. A notice is printed after every
    unsuccessful poll, whether the read raised or the value was empty.
    """
    while True:
        try:
            vipcookie = r.get("vipcookie")
        except Exception:
            # Best-effort poll: any Redis failure is treated as "no cookie yet".
            vipcookie = None
        if vipcookie:
            return vipcookie
        print(" We haven't got any cookie data , Please check redis-server Whether to start or keep .exe Whether to start !")
        time.sleep(1)
def getCookieData(cookieStr, keys):
    """Extract selected cookies from a ``name=value; name=value`` string.

    Args:
        cookieStr: Raw cookie header text. All spaces are removed before
            parsing. Anything that is not a ``str`` yields an empty result.
        keys: Container of the cookie names to keep.

    Returns:
        dict mapping each requested name found in *cookieStr* to its value.
        Segments without ``=`` are ignored; for duplicates the last one wins.
    """
    cookieData = {}
    if not isinstance(cookieStr, str) or keys is None:
        return cookieData
    for pair in cookieStr.replace(" ", "").split(";"):
        # Split on the first "=" only, so values that themselves contain "="
        # (such as base64 padding) are kept intact.
        name, sep, value = pair.partition("=")
        if sep and name in keys:
            cookieData[name] = value
    return cookieData
class vipSpider(threading.Thread):
    """Worker thread that collects product data for brands taken from a queue.

    For every brand name pulled from ``brandQueue`` the thread resolves the
    brand-store serial number, pages through the brand's product ids and
    appends one row per in-stock SKU to a per-brand ``.xls`` workbook.
    """

    # Captures the text from the first "{" to the last "}" of a response
    # line, so a payload wrapped in extra text can still be parsed as JSON.
    _JSON_BODY = re.compile('.*?({.*})')
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    _BRAND_SN = re.compile(r'"brandStore":{"sn":"(\d+)"')
    # Characters stripped from a brand name before it is used in a file name.
    _FILENAME_STRIP = str.maketrans("", "", ':\\/*?"<>|')
    # (detail field, property label) pairs copied from the product properties.
    _PROP_FIELDS = (
        ("syjj", " Applicable season "),
        ("syxb", " Applicable gender "),
        ("ks", " style "),
        ("ml", " Fabric "),
    )
    # Failures expected while digging through a missing or malformed response.
    _PARSE_ERRORS = (TypeError, LookupError, ValueError)

    def __init__(self, brandQueue, index, *args, **kwargs):
        """Store the shared queue and thread index, then acquire a proxy.

        Note that getProxy() blocks until a proxy is available.
        """
        super().__init__(*args, **kwargs)
        self.brandQueue = brandQueue
        self.index = index
        # Path of the workbook for the brand in progress; set in run().
        self.excelPath = None
        self.proxies = getProxy()

    def updateProxy(self):
        """Replace the current proxy with a fresh one and report the change."""
        self.proxies = getProxy()
        print(" Threads %d to update ip %s " % (self.index, self.proxies))

    def initExcel(self, path, title):
        """Create a workbook at *path* whose first row holds *title*.

        Returns True on success, False when the workbook cannot be written.
        """
        try:
            book = xlwt.Workbook()
            sheet = book.add_sheet(u'double', cell_overwrite_ok=True)
            for col, text in enumerate(title):
                sheet.write(0, col, text)
            book.save(path)
            return True
        except Exception as e:
            print(" Failed to create workbook %s : %s" % (path, e))
            return False

    def writeExcel(self, data, path):
        """Append *data* as one new row to the first sheet of *path*.

        Every value is written as ``str``. Returns True when the workbook was
        saved, False otherwise.
        """
        print("------------------------------------------")
        print(data)
        print("------------------------------------------")
        try:
            workbook = xlrd.open_workbook(path)
            firstSheet = workbook.sheet_by_name(workbook.sheet_names()[0])
            nextRow = firstSheet.nrows
            # xlrd workbooks are read-only; xlutils.copy yields a writable one.
            writable = copy(workbook)
            target = writable.get_sheet(0)
            for col, value in enumerate(data):
                try:
                    target.write(nextRow, col, str(value))
                except Exception:
                    # Best effort: a cell that cannot be written is left
                    # blank rather than losing the whole row.
                    continue
            writable.save(path)
            return True
        except Exception as e:
            print(" Failed to append row to %s : %s" % (path, e))
            return False

    @staticmethod
    def _parseJsonBody(text):
        """Return the first ``{...}`` object embedded in *text*, parsed as JSON."""
        return json.loads(vipSpider._JSON_BODY.findall(text)[0])

    @staticmethod
    def _safeFileName(brandName):
        """Return *brandName* as text without the characters ``:\\/*?"<>|``."""
        return str(brandName).translate(vipSpider._FILENAME_STRIP)

    @staticmethod
    def _buildRow(detailData, sku, price):
        """Return the output row for *sku*, or None if it must be skipped.

        A SKU is skipped when its stock is not positive or when any required
        field is missing from *detailData*, *sku* or *price*.
        """
        try:
            if int(sku['stock']) <= 0:
                return None
            return [
                detailData['title'],
                detailData['merchandiseSn'],
                detailData['syjj'],
                detailData['syxb'],
                detailData['ks'],
                detailData['ml'],
                sku['name'],
                price['saleMarketPrice'],
                price['salePrice'],
                price['finalPrice'],
            ]
        except vipSpider._PARSE_ERRORS:
            return None

    def getBrandStoreSn(self, brandName):
        """Return the brand-store serial number as a string, or None.

        The proxy is rotated after every failed attempt; after ``retry``
        failures a notice naming *brandName* is printed.
        """
        for _ in range(retry):
            headers = getHeaders()
            # NOTE(review): the URL carries no query string, so brandName is
            # not part of the request as written - confirm against the API.
            url = "https://mapi.vip.com/vips-mobile/rest/shopping/pc/search/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                return str(self._BRAND_SN.findall(res)[0])
            except (TypeError, IndexError):
                # res is None (fetch failed) or the marker is absent.
                self.updateProxy()
        print(" brand :%s, Failed to obtain brand related information , Please check that the name is correct !" % brandName)
        return None

    def getGoodsList(self, brandStoreSn, page):
        """Return ``(productIds, isLast)`` for one page of a brand's products.

        When every attempt fails, ``([], True)`` is returned so the caller can
        always unpack the result and stops paging for this brand.
        """
        for _ in range(retry):
            headers = getHeaders()
            # NOTE(review): computed but not used by the request as written;
            # presumably meant as a paging query parameter - confirm.
            pageOffset = str((int(page) - 1) * 120)
            url = "https://mapi.vip.com/vips-mobile/rest/shopping/pc/brandstore/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                payload = self._parseJsonBody(res)['data']
                goodsList = payload['productIds']
                isLast = str(payload['isLast']) != "0"
                return goodsList, isLast
            except self._PARSE_ERRORS:
                self.updateProxy()
        return [], True

    def getPropertyVal(self, key, props):
        """Return the ``value`` of the first entry in *props* named *key*.

        Entries that are not mappings or lack ``name``/``value`` are skipped.
        Returns None when *props* is empty or nothing matches.
        """
        if not props:
            return None
        for prop in props:
            try:
                if key == prop['name']:
                    return prop['value']
            except (TypeError, LookupError):
                continue
        return None

    def getGoodsStocks(self, pid):
        """Return the ``items`` list of the stock response, or None on failure."""
        for _ in range(retry):
            url = "https://stock.vip.com/detail"
            # NOTE(review): `params` is not defined anywhere in the visible
            # source, so this call raises NameError until the query
            # parameters (presumably built from pid) are supplied.
            headers = getHeaders_(url, params)
            res = getHtml(url, headers, self.proxies)
            try:
                return self._parseJsonBody(res)['items']
            except self._PARSE_ERRORS:
                self.updateProxy()
        return None

    def _fetchDetail(self, brandName, pid):
        """Return the detail fields for product *pid*, or None if unavailable."""
        for _ in range(retry):
            headers = getHeaders(str(pid))
            url = "https://mapi.vip.com/vips-mobile/rest/shopping/pc2/product/detail/v5"
            res = getHtml(url, headers, self.proxies)
            try:
                product = json.loads(res)['data']['product']
                # Build a fresh dict per attempt so that a failed attempt
                # never leaves half-filled data behind.
                detailData = {
                    'title': product['title'],
                    'brandName': brandName,
                    'brandIdStr': product['brandIdStr'],
                    'merchandiseSn': product['merchandiseSn'],
                }
            except self._PARSE_ERRORS:
                self.updateProxy()
                continue
            for field, label in self._PROP_FIELDS:
                try:
                    value = self.getPropertyVal(label, product['props'])
                except self._PARSE_ERRORS:
                    value = None
                detailData[field] = value if value else ""
            return detailData
        return None

    def getGoodsDetail(self, brandName, pid):
        """Write one workbook row per in-stock SKU of product *pid*.

        Nothing is written when the stock, detail or price data cannot be
        obtained.
        """
        skus = self.getGoodsStocks(pid)
        if not skus:
            return
        detailData = self._fetchDetail(brandName, pid)
        if detailData is None:
            # Without this guard the price lookup below would fail with a
            # KeyError on 'brandIdStr' whenever the detail fetch failed.
            return
        # NOTE(review): getPrice is not defined in the visible part of this
        # class; it must be provided elsewhere for this call to work.
        price = self.getPrice(pid, detailData['brandIdStr'])
        if not price:
            return
        for sku in skus:
            row = self._buildRow(detailData, sku, price)
            if row is not None:
                self.writeExcel(row, self.excelPath)

    def run(self):
        """Process brands from the queue until it is empty."""
        # Only `Queue` is imported at module level, so Empty is imported here.
        from queue import Empty

        while True:
            try:
                # get_nowait() closes the race of empty() followed by a
                # blocking get(): another worker may take the last brand in
                # between, which would leave this thread waiting forever.
                brandName = self.brandQueue.get_nowait()
            except Empty:
                break
            brandStoreSn = self.getBrandStoreSn(brandName)
            if not brandStoreSn:
                continue
            self.excelPath = (
                excelPath + "data_brand_" + self._safeFileName(brandName)
                + "_t_" + getCurrentTime() + ".xls"
            )
            if not self.initExcel(self.excelPath, excelTitle):
                continue
            page = 1
            while True:
                goodsList, isLast = self.getGoodsList(brandStoreSn, page)
                for goods in goodsList or []:
                    try:
                        self.getGoodsDetail(brandName, goods)
                    except Exception as e:
                        # One bad product must not stop the brand, but the
                        # failure is reported rather than hidden.
                        print(" Threads %d failed on product %s : %s" % (self.index, str(goods), e))
                if isLast:
                    break
                page += 1
def getBrandQueue():
    """Return a queue holding the brand names listed in ``brands.txt``.

    The file is read from the current working directory, one brand per line.
    Surrounding whitespace is stripped and blank lines are skipped. When the
    file cannot be read a notice is printed and the queue holds whatever was
    read up to that point (nothing if the file is missing).
    """
    brandQueue = Queue(0)
    try:
        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise become part of the first brand name.
        with open("brands.txt", "r", encoding="utf-8-sig") as f:
            for line in f:
                name = line.strip()
                if name:
                    brandQueue.put(name)
    except (OSError, UnicodeDecodeError) as e:
        print(" Unable to read brands.txt : %s" % e)
    return brandQueue
def main():
    """Load the brand queue and start the worker threads.

    The module-level ``threadNums`` is capped at the number of queued brands,
    so no worker is started without work. Each worker acquires its proxy
    while being constructed.
    """
    global threadNums
    brandQueue = getBrandQueue()
    # Never start more workers than there are brands to process.
    threadNums = min(threadNums, brandQueue.qsize())
    for i in range(threadNums):
        try:
            vipSpider(brandQueue, i).start()
        except Exception as e:
            # Keep starting the remaining workers, but report the failure.
            print(" Failed to start thread %d : %s" % (i, e))


if __name__ == '__main__':
    main()

  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved