您現在的位置：程式師世界 >> 編程語言 >> 更多編程語言 >> Python

利用python爬蟲篩選豆瓣好書

編輯：Python

豆瓣讀書可以通過標籤搜索相應類型的書籍，但是目前我只能看到1000本。
因此寫了一個Douban_spider類，方便查詢相應類型的書籍。

def __init__(self, keyword):
    """Set up the spider for one Douban book tag.

    Args:
        keyword: Tag to browse, e.g. "小說". It is appended verbatim to
            the tag URL.
    """
    self.keyword = keyword
    self.url = "https://book.douban.com/tag/" + self.keyword
    # Browser-like User-Agent sent with every request.
    self.headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/70.0.3538.25 Safari/537.36 "
            "Core/1.70.3741.400 QQBrowser/10.5.3863.400"
        ),
    }
    # Per-instance result list, so two spiders created for different tags
    # never append into the same shared list.
    self.ls = []
def get_page(self, start, timeout=10):
    """Download one page of the tag listing and return its HTML text.

    Args:
        start: Zero-based page index. It is multiplied by 20 and sent as
            the ``start`` query parameter (presumably 20 books per page;
            verify against the site).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failure or timeout.
    """
    params = {
        "start": start * 20,
        "type": "T",
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(
        self.url, params=params, headers=self.headers, timeout=timeout
    )
    return response.text

首先是定義初始屬性，包括初始url和請求頭。keyword參數是你要搜索的書籍類型，比如小說。
然後是獲取網頁源代碼。通過觀察url的變化，很容易構造出start參數（頁碼乘以20），也就是豆瓣書單的下一頁。
類中還定義了一個類屬性ls，也就是一個空列表，用於保存待排序的書籍；完整代碼見文章末尾。

def get_book(self, html):
    """Yield one dict per book entry found in a tag-listing page.

    Args:
        html: HTML source of a listing page, e.g. the text returned by
            ``get_page``.

    Yields:
        dict with four string values:
            ``book``: text of the entry's ``h2`` element (the title).
            ``message``: text of ``div.pub`` (publisher, price, etc.).
            ``score``: text of ``span.rating_nums``.
            ``number``: text of ``span.pl`` without its first and last
                character.
    """
    doc = pq(html)
    for item in doc("li.subject-item ").items():
        # The rating-count text appears to be wrapped in parentheses on the
        # page (TODO confirm); the slice drops that first and last char.
        rating_count = item.find("span.pl").text()[1:-1]
        yield {
            "book": item.find("h2").text(),
            "message": item.find("div.pub").text(),
            "score": item.find("span.rating_nums").text(),
            "number": rating_count,
        }
def get_sort(self, html):
    """Add the books parsed from ``html`` to ``self.ls`` and re-sort it.

    After the call ``self.ls`` is ordered by score, highest first.

    Args:
        html: HTML source of a listing page, passed on to ``get_book``.
    """
    def score_of(entry):
        # Scores are scraped as text. Comparing them as strings would rank
        # "10.0" below "9.5", so compare numerically. Entries whose score
        # is empty or not a number sort last.
        try:
            return float(entry["score"])
        except (TypeError, ValueError):
            return float("-inf")

    self.ls.extend(self.get_book(html))
    self.ls.sort(key=score_of, reverse=True)

這裡利用pyquery解析網頁源代碼，獲取書名、出版社與價格等信息、評分以及評價人數。其中get_sort方法用於在將結果寫入txt類型的文檔時把書籍按評分排序；如果是寫入可用Excel打開的csv類型的文件，則不用該方法。直接將字典類型數據寫入txt需要用到json庫的dumps方法；將字典直接寫入csv文件需要用到csv庫，csv庫的詳細介紹可以參考相關資料。並且注意寫入csv文件時要加encoding和newline兩個參數，不然會出現亂碼或多餘的空行。最後將代碼整合如下。

import time
import re
import requests
import json
import os
import random
import csv
from pyquery import PyQuery as pq
class Douban_spider(object):
    """Crawl the book list of one Douban tag and save it as txt or csv.

    Pages are fetched with ``requests`` and parsed with ``pyquery``. Each
    book is a dict with the keys ``book``, ``message``, ``score`` and
    ``number``.
    """

    # Class-level list kept for backward compatibility. __init__ gives every
    # instance its own list so separate spiders do not share results.
    ls = []

    def __init__(self, keyword):
        """Set up the spider for one tag.

        Args:
            keyword: Tag to browse, e.g. "小說". It is appended verbatim to
                the tag URL and used as the stem of the output file names.
        """
        self.keyword = keyword
        self.url = "https://book.douban.com/tag/" + self.keyword
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/70.0.3538.25 Safari/537.36 "
                "Core/1.70.3741.400 QQBrowser/10.5.3863.400"
            ),
        }
        self.ls = []

    def get_page(self, start, timeout=10):
        """Download one page of the tag listing and return its HTML text.

        Args:
            start: Zero-based page index. It is multiplied by 20 and sent
                as the ``start`` query parameter.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text.
        """
        params = {
            "start": start * 20,
            "type": "T",
        }
        # Without a timeout a stalled connection would block the crawl.
        response = requests.get(
            self.url, params=params, headers=self.headers, timeout=timeout
        )
        return response.text

    def get_book(self, html):
        """Yield one dict per book entry found in a listing page.

        Args:
            html: HTML source of a listing page.

        Yields:
            dict with the string values ``book`` (``h2`` text), ``message``
            (``div.pub`` text), ``score`` (``span.rating_nums`` text) and
            ``number`` (``span.pl`` text without its first and last
            character).
        """
        doc = pq(html)
        for item in doc("li.subject-item ").items():
            yield {
                "book": item.find("h2").text(),
                "message": item.find("div.pub").text(),
                "score": item.find("span.rating_nums").text(),
                "number": item.find("span.pl").text()[1:-1],
            }

    @staticmethod
    def _score_key(entry):
        """Return the numeric sort key for one book dict."""
        # Scores are scraped as text. Comparing them as strings would rank
        # "10.0" below "9.5". Entries whose score is empty or not a number
        # sort last.
        try:
            return float(entry["score"])
        except (TypeError, ValueError):
            return float("-inf")

    def get_sort(self, html):
        """Add the books parsed from ``html`` to ``self.ls``, best first.

        Args:
            html: HTML source of a listing page, passed to ``get_book``.
        """
        self.ls.extend(self.get_book(html))
        self.ls.sort(key=self._score_key, reverse=True)

    def write_book(self):
        """Dump ``self.ls`` as indented JSON into ``<keyword>.txt``."""
        with open(self.keyword + ".txt", "w", encoding="utf-8") as file:
            file.write(json.dumps(self.ls, indent=2, ensure_ascii=False))

    def write_csv(self, data):
        """Append one book dict as a row of ``<keyword>.csv``.

        A header row is written first when the file is still empty.

        Args:
            data: dict with the keys ``book``, ``message``, ``score`` and
                ``number``.
        """
        fieldnames = ["book", "message", "score", "number"]
        # utf-8-sig lets Excel detect the encoding; newline="" stops the
        # csv module from producing blank lines between rows.
        with open(self.keyword + ".csv", "a", encoding="utf-8-sig",
                  newline="") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file.tell() == 0:
                writer.writeheader()
            writer.writerow(data)
if __name__ == "__main__":
    # To save a score-sorted txt file instead, use this variant:
    #     a = Douban_spider("小說")
    #     for i in range(50):
    #         html = a.get_page(i)
    #         a.get_sort(html)
    #         time.sleep(random.randint(3, 5))
    #     a.write_book()

    # Write the results to a csv file.
    a = Douban_spider("小說")
    for i in range(50):
        html = a.get_page(i)
        for data in a.get_book(html):
            a.write_csv(data)
        # Pause between pages so the crawler is less likely to be blocked.
        time.sleep(random.randint(3, 5))