程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您现在的位置: 程式師世界 >> 編程語言 >  >> 更多編程語言 >> Python

Python crawler-2.01carhomespider --- exception handling of two-level page crawling

編輯:Python

Code display

#!/usr/bin/env python3.8.8
# -*- encoding: utf-8 -*-
''' @File : test.py @Time : 2022/02/15 08:19:32 @Author : Hot air balloon @Version : 1.0 @Contact : [email protected] '''
# C:/Users/shiya.liu/AppData/Local/Programs/Python/Python38/python.exe -m pip install
# here put the import lib
"""Autohome (che168.com) used-car data capture - two-level pages.

Crawl targets: car model, mileage, registration date, gearbox type,
displacement, and the location of the vehicle.

First page:  https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exx0/?pvareaid=102179#currengpostion
Second page: https://www.che168.com/china/a0_0msdgscncgpi1ltocsp2exx0/?pvareaid=102179#currengpostion
Third page:  https://www.che168.com/china/a0_0msdgscncgpi1ltocsp3exx0/?pvareaid=102179#currengpostion
"""
from urllib import request
import re
import time
import random
import pymongo
class CarHomeSpider:
    """Two-level crawler for che168.com used-car listings.

    First-level (listing) pages are scanned for links to individual cars;
    each second-level (detail) page is then parsed with a regular expression
    and the extracted fields are stored in a MongoDB collection.
    """

    def __init__(self):
        # Listing-page URL template; ``{}`` is replaced by the page number.
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exx0/?pvareaid=102179#currengpostion'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'
        }
        # Not read anywhere in this class; kept so existing callers that
        # touch ``spider.i`` keep working.
        self.i = 0
        self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
        self.db = self.conn['carhomedb']
        self.myset = self.db['carhomeset']

    def get_html(self, url):
        """Fetch ``url`` with the spider's headers and return the decoded body.

        The body is decoded as gb18030; undecodable bytes are dropped.
        """
        req = request.Request(url=url, headers=self.headers)
        # The context manager closes the HTTP response (and its socket) even
        # if reading or decoding fails.
        with request.urlopen(req) as res:
            return res.read().decode('gb18030', 'ignore')

    def re_func(self, regex, html):
        """Return every match of ``regex`` in ``html`` ('.' also matches newlines)."""
        return re.compile(regex, re.S).findall(html)

    def parse_html(self, one_url):
        """Crawl one listing page and every car detail page it links to."""
        one_html = self.get_html(one_url)
        one_regex = r'<li class="cards-li list-photo-li.*?<a href="(.*?)".*?</li>'
        for href in self.re_func(one_regex, one_html):
            if href.startswith(('http://', 'https://')):
                # Already absolute (e.g. https://semnt.autohome.com.cn/...).
                car_url = href
            elif href.startswith('//'):
                # Protocol-relative link: only the scheme is missing.
                car_url = 'https:' + href
            else:
                car_url = 'https://www.che168.com' + href
            self.get_data(car_url=car_url)
            # Pause a random 0-5 seconds after each car to throttle requests.
            time.sleep(random.uniform(0, 5))

    @staticmethod
    def _build_item(fields):
        """Convert one regex match tuple into a document, or None if malformed."""
        name, km, reg_time, gear, city, price = fields
        # The stored price is the text after the first ';' in the captured
        # price markup; without a ';' there is nothing to extract.
        price_parts = price.split(';')
        if len(price_parts) < 2:
            return None
        return {
            'name': name.strip(),
            'km': km.strip(),
            'time': reg_time.strip(),
            'type': gear.split('/')[0].strip(),
            'city': city.strip(),
            'price': price_parts[1].strip(),
        }

    def get_data(self, car_url):
        """Fetch one car detail page, then print and store its fields.

        Pages whose structure does not match the expected layout are reported
        on stdout and skipped instead of raising.
        """
        two_html = self.get_html(car_url)
        # NOTE(review): the literal "<b> ten thousand </b>" looks like a
        # machine-translated unit label - confirm against the live markup.
        two_regex = r'<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">(.*?)<b> ten thousand </b><i class="usedfont used-xiajiantou"></i>.*?'
        car_list = self.re_func(two_regex, two_html)
        if not car_list:
            # Some detail pages use a different layout and yield no match.
            print('--------->', 'no car details matched:', car_url)
            return
        item = self._build_item(car_list[0])
        if item is None:
            print('--------->', 'unexpected price format:', car_url)
            return
        print(item)
        self.myset.insert_one(item)

    def run(self, first_page=1, last_page=4):
        """Entry point: crawl listing pages ``first_page``..``last_page`` inclusive.

        The defaults reproduce the original hard-coded range (pages 1-4).
        """
        for page in range(first_page, last_page + 1):
            self.parse_html(self.url.format(page))
if __name__ == '__main__':
    # Script entry point: build the spider (which creates the MongoDB client)
    # and start crawling the listing pages.
    CarHomeSpider().run()

Result display

> db.carhomeset.find().pretty()
{
"_id" : ObjectId("620afae87d0c6fa12969fdad"),
"name" : " roewe RX5 2016 paragraph 20T Two drive automatic Internet smart version ",
"km" : "8.5 Thousands of kilometers ",
"time" : "2017 year 07 month ",
"type" : " Automatically ",
"city" : " Shenzhen ",
"price" : "7.60"
}
{
"_id" : ObjectId("620afaec7d0c6fa12969fdae"),
"name" : " jaguar XJ 2007 paragraph XJ8L 4.2 Royal extension ",
"km" : "16 Thousands of kilometers ",
"time" : "2007 year 05 month ",
"type" : " Automatically ",
"city" : " shenyang ",
"price" : "31.80"
}
{
"_id" : ObjectId("620afaf07d0c6fa12969fdaf"),
"name" : " lexus RX 2020 paragraph 450h 4WD classic countries V",
"km" : "1.5 Thousands of kilometers ",
"time" : "2021 year 01 month ",
"type" : " Automatically ",
"city" : " Hangzhou ",
"price" : "61.80"
}
{
"_id" : ObjectId("620afaff7d0c6fa12969fdb0"),
"name" : " Alison 2012 paragraph 2.4L VTi-S Noble edition ",
"km" : "18 Thousands of kilometers ",
"time" : "2014 year 05 month ",
"type" : " Automatically ",
"city" : " huizhou ",
"price" : "14.88"
}
{
"_id" : ObjectId("620afb037d0c6fa12969fdb1"),
"name" : " audi A4L 2020 paragraph 45 TFSI quattro Perfect dynamic ",
"km" : "0.7 Thousands of kilometers ",
"time" : "2020 year 04 month ",
"type" : " Automatically ",
"city" : " wuxi ",
"price" : "32.20"
}
{
"_id" : ObjectId("620afb067d0c6fa12969fdb2"),
"name" : " buick GL8 2014 paragraph 2.4L Classic Edition ",
"km" : "12 Thousands of kilometers ",
"time" : "2015 year 04 month ",
"type" : " Automatically ",
"city" : " shijiazhuang ",
"price" : "7.80"
}
{
"_id" : ObjectId("620afb077d0c6fa12969fdb3"),
"name" : " audi A6L 2020 paragraph 45 TFSI Refined and elegant ",
"km" : "1.6 Thousands of kilometers ",
"time" : "2020 year 09 month ",
"type" : " Automatically ",
"city" : " ningbo ",
"price" : "37.68"
}
{
"_id" : ObjectId("620afb0e7d0c6fa12969fdb4"),
"name" : " changan CS75 2014 paragraph 2.0L Manual luxury countries V",
"km" : "5.2 Thousands of kilometers ",
"time" : "2015 year 09 month ",
"type" : " Manual ",
"city" : " xingtai ",
"price" : "4.98"
}
{
"_id" : ObjectId("620afb0f7d0c6fa12969fdb5"),
"name" : "Panamera 2019 paragraph Panamera 2.9T",
"km" : "1.5 Thousands of kilometers ",
"time" : "2020 year 01 month ",
"type" : " Automatically ",
"city" : " Chongqing ",
"price" : "119.80"
}
{
"_id" : ObjectId("620afb127d0c6fa12969fdb6"),
"name" : " Mercedes S level 2019 paragraph S 350 L Noble type ",
"km" : "2 Thousands of kilometers ",
"time" : "2019 year 07 month ",
"type" : " Automatically ",
"city" : " taiyuan ",
"price" : "83.58"
}
{
"_id" : ObjectId("620afb157d0c6fa12969fdb7"),
"name" : " Alison 2019 paragraph 2.0L Hybrid supreme ",
"km" : "1.9 Thousands of kilometers ",
"time" : "2019 year 09 month ",
"type" : " Automatically ",
"city" : " Hefei ",
"price" : "28.88"
}
{
"_id" : ObjectId("620afb197d0c6fa12969fdb8"),
"name" : " buick GL8 2017 paragraph 25S Luxury countries V",
"km" : "5 Thousands of kilometers ",
"time" : "2018 year 06 month ",
"type" : " Automatically ",
"city" : " shijiazhuang ",
"price" : "18.60"
}
{
"_id" : ObjectId("620afb1d7d0c6fa12969fdb9"),
"name" : " highlander 2018 paragraph 2.0T Four wheel drive Deluxe 7 seat countries V",
"km" : "3.5 Thousands of kilometers ",
"time" : "2019 year 01 month ",
"type" : " Automatically ",
"city" : " shijiazhuang ",
"price" : "27.80"
}
{
"_id" : ObjectId("620afb1e7d0c6fa12969fdba"),
"name" : " BMW 5 system 2021 paragraph 525Li M Sport suit ",
"km" : "0.7 Thousands of kilometers ",
"time" : "2021 year 02 month ",
"type" : " Automatically ",
"city" : " shijiazhuang ",
"price" : "39.90"
}
{
"_id" : ObjectId("620afb227d0c6fa12969fdbb"),
"name" : " Alison 2016 paragraph 2.4L Deluxe edition ",
"km" : "7 Thousands of kilometers ",
"time" : "2018 year 07 month ",
"type" : " Automatically ",
"city" : " shijiazhuang ",
"price" : "24.90"
}
{
"_id" : ObjectId("620afb257d0c6fa12969fdbc"),
"name" : "Panamera 2019 paragraph Panamera 4 Executive extension 2.9T",
"km" : "5 Thousands of kilometers ",
"time" : "2020 year 03 month ",
"type" : " Automatically ",
"city" : " Chongqing ",
"price" : "123.80"
}
{
"_id" : ObjectId("620afb387d0c6fa12969fdbd"),
"name" : " buick GL8 2017 paragraph 25S Luxury countries V",
"km" : "7.6 Thousands of kilometers ",
"time" : "2017 year 09 month ",
"type" : " Automatically ",
"city" : " Xi'an ",
"price" : "17.90"
}
{
"_id" : ObjectId("620afb3c7d0c6fa12969fdbe"),
"name" : " lexus IS 2009 paragraph 300C",
"km" : "8.2 Thousands of kilometers ",
"time" : "2009 year 10 month ",
"type" : " Automatically ",
"city" : " Hangzhou ",
"price" : "16.90"
}
{
"_id" : ObjectId("620afb3e7d0c6fa12969fdbf"),
"name" : " audi A4L 2020 paragraph 40 TFSI quattro Luxury dynamic ",
"km" : "0.95 Thousands of kilometers ",
"time" : "2020 year 05 month ",
"type" : " Automatically ",
"city" : " Chongqing ",
"price" : "29.70"
}
{
"_id" : ObjectId("620afb427d0c6fa12969fdc0"),
"name" : " audi Q5L Sportback 2021 paragraph 45 TFSI Luxury ",
"km" : "0.29 Thousands of kilometers ",
"time" : "2021 year 01 month ",
"type" : " Automatically ",
"city" : " Chongqing ",
"price" : "40.60"
}
Type "it" for more
>

Cause of the exception
Some detail pages have a different structure, so the regular expression finds no match. The result list is then empty, and indexing into it raises "IndexError: list index out of range".


  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved