您现在的位置：程式師世界 >> 編程語言 > >> 更多編程語言 >> Python

【Python爬蟲】urllib庫——尚硅谷

編輯：Python

1.urllib庫的使用

urllib.request.urlopen() 模擬浏覽器向服務器發送請求

response 服務器返回的數據

response的數據類型是HTTPResponse

字節-->字符串

解碼decode

字符串-->字節

編碼encode


# Fetch the source of the Baidu home page with urllib.
import urllib.request

# 1. The address to visit.
url = 'http://www.baidu.com'

# 2. Send the request the way a browser would (requires network access).
#    The context manager closes the connection once the body has been read.
with urllib.request.urlopen(url) as response:
    # 3. read() returns raw bytes; decode('<encoding>') turns them into a
    #    str. This step is essential before treating the page as text.
    content = response.read().decode('utf-8')

# 4. Print the page source.
print(content)

read() 以字節形式讀取二進制數據。擴展：read(5) 只返回前 5 個字節

readline() 讀取一行

readlines() 一行一行讀取直至結束

getcode() 獲取狀態碼

geturl() 獲取url

getheaders() 獲取headers

urllib.request.urlretrieve()

import urllib.request

url = 'http://www.baidu.com'

# Send the request the way a browser would. The context manager closes the
# connection when the block exits.
with urllib.request.urlopen(url) as response:
    # One type and six methods.
    # The response object is an HTTPResponse.
    print(type(response))

    # The examples below are intentionally left commented out; uncomment
    # the one you want to try.

    # Read the whole body as bytes.
    # content=response.read()
    # print(content)

    # Read only a given number of bytes (here: five).
    # content = response.read(5)
    # print(content)

    # Read a single line.
    # content = response.readline()
    # print(content)

    # Read all remaining lines into a list.
    # content = response.readlines()
    # print(content)

    # Get the status code; 200 means the request succeeded.
    # print(response.getcode())

    # Get the URL that was requested.
    # print(response.geturl())

    # Get the response headers.
    # print(response.getheaders())

# One type:    HTTPResponse
# Six methods: read readline readlines getcode geturl getheaders

請求頁面

請求圖片

請求視頻

import urllib.request

# --- Download a web page ---
url_page = "http://www.baidu.com"
# urlretrieve takes the address to download (url) and the name of the file
# to save it under (filename). Arguments can be passed as variables or as
# literal values.
# urllib.request.urlretrieve(url_page,'baidu.html')

# --- Download an image ---
# NOTE(review): this address looks like a search-result page rather than a
# direct link to an image file - confirm before relying on it.
# url_img='https://image.so.com/view?src=imageonebox&q=LISA&obx_type=360pic_new_strong&correct=LISA&ancestor=list&cmsid=9cc840c2046491a3c774ae2f7fa315f4&cmras=0&cn=0&gn=0&kn=50&crn=0&bxn=0&fsn=110&cuben=0&pornn=0&manun=50&adstar=0&clw=241#id=8bc223c645b45b276ae559e1ab81c72f&currsn=0&ps=129&pc=129'
# urllib.request.urlretrieve(url=url_img, filename='lisa.png')

# --- Download a video ---
# (no example given)

2.請求對象的定制

爬蟲是模擬浏覽器向服務器發送請求的過程，定制請求對象是應對反爬蟲的一種手段

需要使用 headers 定制 User-Agent（其中包含操作系統、浏覽器等信息）


import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL, using https://www.baidu.com/s?wd=周傑倫 as the example:
#
#   http/https   www.baidu.com   80/443   s      wd=周傑倫    #
#   protocol     host            port     path   parameters   anchor
#
# Well-known default ports:
#   http    80
#   https   443
#   mysql   3306
#   oracle  1521
#   redis   6379
#   mongodb 27017

# Request headers, as a dictionary.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# urlopen() has no parameter for a headers dictionary, so the headers are
# attached to a Request object instead.
# Request's positional order is (url, data, headers, ...), so url and
# headers must be passed by keyword to skip over data.
request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf8')

print(content)

# The same request without custom headers, for comparison:
# response = urllib.request.urlopen(url)
# content = response.read().decode('utf8')
# print(content)

1.get請求的quote方法（把中文轉換成 URL 百分號編碼，即 UTF-8 字節的 %XX 形式）


# Goal: fetch the source of https://www.baidu.com/s?wd=周傑倫
# The address actually sent over the wire is
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='

# A customised request object is one way to get past anti-crawler checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# Non-ASCII text cannot go into a URL as-is; urllib.parse.quote
# percent-encodes it (the UTF-8 bytes become %XX sequences).
name = urllib.parse.quote('周傑倫')
print(name)

url = url + name
print(url)

# Build the customised request object.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection afterwards.
with urllib.request.urlopen(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# Print the result.
print(content)

2.get請求的urlencode方法

適用於多個參數的情況之下，直接定義為一個字典形式


# urlencode is the tool to use when there are several query parameters,
# e.g. https://www.baidu.com/s?wd=周傑倫&sex=男
import urllib.parse
import urllib.request

# Minimal demonstration of urlencode on its own:
# data={
# 'wd':'周傑倫',
# 'sex':'男',
# 'location':'中國台灣省'
# }
# a=urllib.parse.urlencode(data)
# print(a)

base_url = 'https://www.baidu.com/s?'

data = {
    'wd': '周傑倫',
    'sex': '男',
    'location': '中國台灣省',
}

# urlencode turns the dictionary into a percent-encoded "k=v&k=v" string.
new_data = urllib.parse.urlencode(data)
url = base_url + new_data

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# Build the customised request object.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection afterwards.
with urllib.request.urlopen(request) as response:
    # Read and decode the page source.
    content = response.read().decode('utf-8')

# Print the result.
print(content)

3.post請求百度翻譯

注意：post請求的參數 必須進行編碼

POST的請求參數是不會拼接在url後面的，而是需要放在請求對象定制的參數中


# POST request example.
import json
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

data = {
    'kw': 'spider'
}

# POST parameters must be encoded twice over: urlencode() builds the
# "k=v" string and .encode() turns it into the bytes Request expects.
data = urllib.parse.urlencode(data).encode('utf-8')
print(data)

# POST parameters are not appended to the URL; they are passed to the
# Request object through its data argument.
request = urllib.request.Request(url=url, data=data, headers=headers)
print(request)

# Send the request; the context manager closes the connection afterwards.
with urllib.request.urlopen(request) as response:
    print(response)
    # Read and decode the response body.
    content = response.read().decode('utf-8')

print(content)

# Parse the JSON text into Python objects.
obj = json.loads(content)
print(obj)

4.ajax的get請求豆瓣電影的第一頁

保存信息到文件之中一共有兩種方式

# Both variants assume `content` already holds the text to save
# (presumably the decoded response body from the request code).

# Way 1: open, write and close the file explicitly.
fp=open('douban.json','w',encoding='utf-8')
fp.write(content)
fp.close()

# Way 2: let a with-statement manage the file. The file is closed
# automatically when the block exits, so no explicit close() is needed.
with open('douban1.json','w',encoding='utf-8') as fp:
    fp.write(content)


# GET request.
# Fetch the first page of the Douban movie chart and save it locally.
import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# 1. Build the customised request object.
request = urllib.request.Request(url=url, headers=headers)

# 2. Fetch the response; the context manager closes the connection
#    once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
print(content)

# 3. Save the data locally.
# open() falls back to a platform default encoding (gbk on the author's
# machine), so encoding='utf-8' is passed explicitly to store Chinese text.
#
# Alternative with explicit open/close:
# fp=open('douban.json','w',encoding='utf-8')
# fp.write(content)
# fp.close()
#
# Parsing the JSON instead of saving it:
# import json
# obj=json.loads(content)
# print(obj)
with open('douban1.json', 'w', encoding='utf-8') as fp:
    # The with-statement closes the file; no explicit close() is needed.
    fp.write(content)

5.Ajax的get請求豆瓣電影的前10頁

需要用到函數把其分段進行循環輸出

import urllib.request
import urllib.parse
#https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20
#https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20
#https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20
#page 1 2 3 4
#start 0 20 40 60
#start (page-1)*20
#下載豆瓣電影前10頁的數據
#1.請求對象的定制
#2.獲取響應的數據
#3.下載數據
def create_request(page):
    """Build the GET request for one page of the Douban movie chart.

    Args:
        page: 1-based page number; each page holds 20 entries, so the
            query offset is ``(page - 1) * 20``.

    Returns:
        A ``urllib.request.Request`` for that page, carrying a browser
        User-Agent header. The final URL is also printed.
    """
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    data = {
        'start': (page - 1) * 20,
        'limit': 20,
    }
    # GET parameters are appended to the URL, so they stay a str (no .encode()).
    data = urllib.parse.urlencode(data)
    url = base_url + data
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request
def get_content(request):
    """Send ``request`` and return the response body decoded as UTF-8.

    Args:
        request: Anything ``urllib.request.urlopen`` accepts (a URL
            string or a ``Request`` object).

    Returns:
        The response body as a ``str``.
    """
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    return content
def down_load(page,content):
    """Write ``content`` to ``douban_<page>.json`` in the working directory.

    Args:
        page: Page number, used only to build the file name.
        content: Text to write; the file is encoded as UTF-8.
    """
    with open('douban_'+str(page)+'.json','w',encoding='utf-8') as fp:
        fp.write(content)
# Program entry point: ask for a page range, then download each page.
if __name__ == '__main__':
    start_page = int(input('請輸入起始頁碼'))
    end_page = int(input('請輸入結束頁碼'))
    # end_page is inclusive, hence the +1.
    for page in range(start_page, end_page + 1):
        # Every page needs its own request object.
        request = create_request(page)
        # Fetch the response body.
        content = get_content(request)
        # Save it to disk.
        down_load(page, content)

6.ajax的post請求肯德基官網


import urllib.request
import urllib.parse
#base_url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
def create_request(page):
    """Build the POST request for one page of the KFC store list.

    Args:
        page: Page index sent to the server as ``pageIndex``.

    Returns:
        A ``urllib.request.Request`` whose body holds the url-encoded
        form fields (city fixed to '北京', 10 results per page) and
        which carries a browser User-Agent header.
    """
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': 10,
    }
    # POST bodies must be bytes, so the encoded form string is .encode()d.
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    request = urllib.request.Request(url=base_url, headers=headers, data=data)
    return request
def get_content(request):
    """Send ``request`` and return the response body decoded as UTF-8.

    Args:
        request: Anything ``urllib.request.urlopen`` accepts (a URL
            string or a ``Request`` object).

    Returns:
        The response body as a ``str``.
    """
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    return content
def down_load(page,content):
    """Write ``content`` to ``kfc_<page>.json`` in the working directory.

    Args:
        page: Page number, used only to build the file name.
        content: Text to write; the file is encoded as UTF-8.
    """
    # A with-statement closes the file even if the write fails.
    with open('kfc_'+str(page)+'.json','w',encoding='utf-8') as fp:
        fp.write(content)
# Program entry point: ask for a page range, then download each page.
if __name__ == '__main__':
    start_page = int(input('請輸入起始頁碼'))
    end_page = int(input('請輸入結束頁碼'))
    # end_page is inclusive, hence the +1.
    for page in range(start_page, end_page + 1):
        # Build the request object for this page.
        request = create_request(page)
        # Fetch the response body.
        content = get_content(request)
        # Save it to disk.
        down_load(page, content)

7.微博的cookie登錄

# Use case: when collecting data from a page that sits behind a login,
# send the cookie of an already logged-in session to get past the login.
import urllib.request

url = 'https://weibo.com/u/5494282906'

# NOTE(review): the cookie and x-xsrf-token below look like real session
# credentials hard-coded into the source. They should be rotated and loaded
# from configuration or the environment rather than committed - confirm.
headers = {
    # 'authority': 'weibo.com',
    # 'method': 'GET',
    # 'path': '/ajax/profile/info?uid=5494282906',
    # 'scheme': 'https',
    'accept': 'application/json, text/plain, */*',
    # Left disabled: urllib does not decompress responses itself, so asking
    # for gzip/br here would break the utf-8 decode below.
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'client-version': '2.34.75',
    # The cookie carries the login state; with a logged-in cookie the
    # request can reach pages that would otherwise require signing in.
    'cookie': 'SINAGLOBAL=4321438772735.1265.1638876314958; UOR=,,www.huya.com; XSRF-TOKEN=0Y1gSmTH7wyvdhpyilfV_vlf; PC_TOKEN=10a79267b9; login_sid_t=356705c36e6e3ef551138a832110c242; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=weibo.com; Apache=2620855400086.0684.1659428891597; ULV=1659428891603:5:1:1:2620855400086.0684.1659428891597:1658372773115; wb_view_log=1920*10801; SUB=_2A25P7JBkDeRhGeNK4lYT-CzFyzqIHXVsm4asrDV8PUNbmtANLVfxkW9NSS8zowa0jAsEWFtjMyrrJhhbJb1BXURy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhSQTOOaS3AEmM3FHDZyNqZ5JpX5KzhUgL.Fo-X1KBE1hz4ehq2dJLoIE-LxKnLBoqL1h-LxKMLB.2LBKMLxK.L1hML1K2LxKML12eLBoxu; ALF=1690964916; SSOLoginState=1659428917; WBPSESS=U2WFjk9l_oENTuN-ANFxmzGiwhSrHzYTavgzgmNaP5OU_qgTbyEYEFAvw7wHHNGa2WHU1KDVGkSTJwj61IcxVEYa22hAhm0IFE0Ig-zzZSCZhGbs0dG4VRbdhnYzcsjhdnx-e4jptHJ2HLHmIs7HIQ==',
    # referer tells the server which page the request came from; it is
    # commonly checked to prevent image hot-linking.
    'referer': 'https://weibo.com/u/5494282906',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'server-version': 'v2022.08.01.2',
    'traceparent': '00-2841b7aa70142fce9b3da2c5022e1be5-4a79deee3f4f0e4e-00',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 QIHU 360SE',
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': '0Y1gSmTH7wyvdhpyilfV_vlf',
}

# Build the customised request object.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection afterwards.
with urllib.request.urlopen(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# Save the page locally; the with-statement closes the file even if the
# write fails.
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

8.handler處理器的基本使用


# Goal: fetch the Baidu page source through a handler instead of urlopen().
import urllib.request

url = 'https://baidu.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# Three steps: handler -> build_opener -> open
# 1. Create the handler object.
handler = urllib.request.HTTPHandler()
# 2. Build an opener from the handler.
opener = urllib.request.build_opener(handler)
# 3. Call open() on the opener; the context manager closes the
#    connection once the body has been read.
with opener.open(request) as response:
    content = response.read().decode('utf-8')

print(content)

9.代理服務器

1.代理的常用功能

1.突破自身ip訪問限制，訪問國外站點

2.訪問一些單位或團體內部資源

擴展：某大學FTP（前提是該代理地址在該資源的允許訪問范圍之內），使用教育網內地址免費代理服務器，就可以用於對教育網開放的各類FTP下載上傳，以及各類資料查詢共享等服務

3.提高訪問速度

擴展：通常代理服務器都設置一個較大的硬盤緩沖區，當有外界的信息通過時，同時也將其保存到緩沖區中，當其他用戶再訪問相同的信息時，則直接由緩沖區取出信息，傳給用戶，以提高訪問速度

4.隱藏真實ip

擴展：上網者也可以通過這種方法隱藏自己的ip，以免遭受攻擊

2.代碼配置代理

創建Request對象

創建ProxyHandler對象

用handler對象創建opener對象

使用opener.open函數發送請求

import urllib.request

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

# Build the customised request object.
request = urllib.request.Request(url=url, headers=headers)

# A direct request without a proxy would be:
# response=urllib.request.urlopen(request)

# Proxy to use, keyed by URL scheme.
proxies = {
    'http': '221.4.241.198:9091'
}

# Three steps: handler -> build_opener -> open
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)

# Send the request through the proxy; the context manager closes the
# connection once the body has been read.
with opener.open(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# Save the page locally; the with-statement closes the file even if the
# write fails.
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)