您现在的位置：程式師世界 >> 編程語言 > >> 更多編程語言 >> Python

【python學習8】字符串處理,數據驗證和處理,正則表達式

編輯：Python

（一）字符串處理的相關方法

注意是相關方法,不是函數

# Case conversion
s1 = "HelloWorld"
new_s1 = s1.lower()   # returns a NEW all-lowercase string; s1 itself is unchanged
new__s1 = s1.upper()  # returns a NEW all-uppercase string
# Splitting a string
# NOTE: the published page showed an e-mail-protection placeholder here
# ("[email protected]"); a plain sample address is used so the split is meaningful.
s2 = "user@example.com"
new_s2 = s2.split("@")  # split() returns a list
print("郵箱名", new_s2[0], "郵箱域名", new_s2[1])
# Counting how often a substring occurs
print(s1.count("o"))
# Searching
print(s1.find("o"))   # index of the first "o"
print(s1.index("o"))  # same result as find() when the substring exists
print(s1.find("p"))   # not found, so find() returns -1
# index() differs from find(): a missing substring raises ValueError
# instead of returning -1, so the call is guarded here.
try:
    print(s1.index("p"))
except ValueError as exc:
    print("ValueError:", exc)
# Prefix and suffix checks
print(s1.startswith("H"))          # True
print(s1.startswith("p"))          # False
print("demo.py".endswith(".py"))   # True

s = "HelloWorld"
# Replacement: every "o" becomes "你好"; the result is a new string.
# (The original searched for the digit "0", which never occurs in s.)
new_s = s.replace("o", "你好")
# Centre the string inside a field of width 20, padded with "*"
print(s.center(20, "*"))
# Stripping surrounding whitespace; each call returns a new string
s1 = " Hello World "
print(s1.strip())   # strips both ends (whitespace by default)
print(s1.lstrip())  # strips the left end only
print(s1.rstrip())  # strips the right end only
# Stripping specific characters: the argument is a SET of characters,
# so "ld" and "dl" behave identically.
s3 = "dl-helloworld"
print(s3.strip("ld"))  # "-hellowor"

（二）格式化字符串

# Three ways to build the same formatted line from these values.
name, age, score = "馬冬梅", 18, 98.3

# (1) %-placeholders: the values are supplied as a tuple after the % operator.
print("姓名:%s,年齡:%d,成績:%f" % (name, age, score))
# %.2f keeps two decimal places for score.
print("姓名:%s,年齡:%d,成績:%.2f" % (name, age, score))

# (2) f-string: the f prefix is required, otherwise the braces are printed
# literally; each {...} is replaced by the value of the expression inside.
print(f"姓名：{name},年齡：{age},成績：{score}")

# (3) str.format(): the number inside each {} is the zero-based position
# of the argument passed to format().
print("姓名:{0},年齡:{1},成績:{2}".format(name, age, score))
# The slots may appear in any order as long as the indexes match the arguments.
print("姓名:{2},年齡:{0},成績:{1}".format(age, score, name))

# str.format() format specs, part 1: fill character, alignment and width.
s = "helloworld"
# Slot 0 refers to the first format() argument; "*<20" = fill "*", left-align, width 20.
print("{0:*<20}".format(s))

# Thousands separator "," (numbers only: ints and floats).
for number in (9873256, 9873256.236):
    print("{0:,}".format(number))  # 9,873,256 and 9,873,256.236

# Precision: for a float it is the number of decimals (rounded) ...
print("{0:.2f}".format(3.1415926535))  # 3.14
# ... for a string it is the maximum number of characters shown.
print("{0:.5}".format("helloworld"))  # hello

# Part 2: the trailing type code.
# Integer types: b binary, c Unicode character, d decimal, o octal,
# x lower-case hex, X upper-case hex. Every slot is index 0, so a single
# argument is enough.
a = 425
print("{0:b},{0:c},{0:d},{0:o},{0:x},{0:X}".format(a))

# Float types: f fixed point, E / e scientific notation, % percentage
# (the precision applies to the percentage value).
b = 3.141592
print("{0:.2f},{0:.2E},{0:.2e},{0:.2%}".format(b))

（三）字符串的編碼與解碼

errors參數常用的取值有三個：
strict指嚴格操作，不符合直接報錯
ignore指忽略錯誤
replace指替換，不認識直接替換成 "?" 號

s = "偉大的中國"
# Encoding: str -> bytes
scode = s.encode("gbk")
# An identifier cannot contain "-", so the variable is named s_utf8.
s_utf8 = s.encode("utf-8")
# Decoding: bytes -> str. Both spellings are equivalent:
print(bytes.decode(scode, "gbk"))  # unbound form: bytes.decode(obj, encoding)
print(s_utf8.decode("utf-8"))      # bound-method form: obj.decode(encoding)

(四)數據的驗證

str.isdigit(): 驗證所有字符都是 十進制的阿拉伯數字
str.isnumeric(): 驗證所有字符都是數字,包括阿拉伯數字，羅馬數字，漢字大寫數字(壹貳)，漢字數字(一二)，二進制數字不行
str.isalpha(): 判斷所有字符都是字母(包括中文)，中文數字也行，但阿拉伯數字不行
str.isalnum(): 判斷所有字符都是數字或字母(包括中文)，中文數字也行，阿拉伯數字也行
str.islower(),str.isupper(): 判斷所有字符都是大寫或小寫,由於中文不分大小寫，故只判斷英文字母的大小寫
str.istitle(): 判斷字符串是否為標題格式，即每個單詞都是首字母大寫、其餘字母小寫；任何不分大小寫的字符(空格、數字、標點、中文等)都會開始一個新單詞，否則認為是同一個單詞。由於中文不分大小寫，故只判斷英文字母的大小寫

# The Roman numerals below are the dedicated Unicode characters
# U+2160..U+2162 ("ⅠⅡⅢ"), not the ASCII letter "I"; with ASCII letters
# the isnumeric()/isalpha() results shown here would be the opposite.

# isdigit(): every character is a digit character
print("123".isdigit())      # True
print("一二三".isdigit())    # False
print("0b1001".isdigit())   # False ("b" is not a digit)
print("ⅠⅡⅢ".isdigit())     # False (Roman numerals)
print("壹貳叄".isdigit())    # False
# isnumeric(): every character is numeric (Arabic digits, Roman numerals,
# Chinese numerals such as 一二三 and 壹貳叄)
print("123".isnumeric())     # True
print("一二三".isnumeric())   # True
print("0b1001".isnumeric())  # False
print("ⅠⅡⅢ".isnumeric())    # True (Roman numerals)
print("壹貳叄".isnumeric())   # True
# isalpha(): every character is a letter (English or Chinese)
print("hello你好".isalpha())       # True
print("hello你好123".isalpha())    # False
print("hello你好一二三".isalpha())  # True
print("hello你好ⅠⅡⅢ".isalpha())   # False
# isalnum(): every character is a letter or a number
print("hello你好123".isalnum())     # True
print("hello你好123...".isalnum())  # False ("." is neither)
print("hello你好一二三".isalnum())   # True
print("hello你好壹貳叄".isalnum())   # True
print("hello你好ⅠⅡⅢ".isalnum())    # True
# istitle(): every word starts with an upper-case letter followed by lower-case
print("Hello".istitle())        # True
print("HelloWorld".istitle())   # False
print("Helloworld".istitle())   # True
print("Hello world".istitle())  # False
print("Hello World".istitle())  # True
print("Hello你好".istitle())     # True
# islower(): every cased character is lower-case
print("Hello".islower())      # False
print("hello".islower())      # True
print("hello你好".islower())   # True: Chinese has no case, only the letters count
# isupper() works the same way for upper-case.
# isspace(): every character is whitespace
print("\t".isspace())  # True
print("\n".isspace())  # True
print(" ".isspace())   # True

（五）數據處理

1，字符串的拼接操作

使用“+”拼接 和 使用join()方法拼接 是最常用的，join()方法也用於 列表和元組的拼接

s1 = "hello"
s2 = "world"
# (1) Concatenate with "+"
print(s1 + s2)
# (2) Concatenate with join(): it is a METHOD call, so the list goes inside
# parentheses. Joining on the empty string prints "helloworld".
print("".join(["hello", "world"]))
# join() puts the separator BETWEEN the items, never before the first one.
print("*".join(["hello", "world", "php"]))  # "hello*world*php"
# (3) Adjacent string literals are concatenated automatically
print("hello""world")
# (4) Concatenate through string formatting
print("%s%s" % (s1, s2))          # "helloworld"
print(f"{s1}{s2}")                # "helloworld"
print("{0}{1}".format(s1, s2))    # "helloworld"

2，字符串的去重

# Remove duplicate characters from s while keeping first-occurrence order.
s = "alknvlakgakdvnpawirugsagkj"
# (1) for loop + "not in"
new_s1 = ""
for item in s:
    if item not in new_s1:  # keep a character only the first time it is seen
        new_s1 += item      # both operands are str, so += concatenates
print(new_s1)
# (2) index-based loop with range() + "not in"
new_s2 = ""
for i in range(len(s)):
    if s[i] not in new_s2:  # look the character up by index
        new_s2 += s[i]
print(new_s2)
# (3) set for de-duplication + list sort + join()
new_s3 = set(s)          # a set drops duplicates but also loses the order
lst = list(new_s3)       # convert to a list so it can be sorted
lst.sort(key=s.index)    # restore the original order: sort by first position in s
print("".join(lst))      # glue the characters back together

3，列表元素的去重

# Remove duplicate elements from a list while keeping first-occurrence order.
lst = ["金星", "木星", "水星", "火星", "土星", "金星", "木星", "水星", "火星", "土星"]
new_lst = []
# (1) for loop + "not in"
for item in lst:
    if item not in new_lst:
        new_lst.append(item)  # keep an element only the first time it is seen
# (2) index-based loop with range() + "not in"
new_lst2 = []
for i in range(len(lst)):
    if lst[i] not in new_lst2:
        new_lst2.append(lst[i])
# (3) set for de-duplication, then sort the list back into the original
# order using each element's first position in lst as the key
s_lst = set(lst)
new_lst3 = list(s_lst)
new_lst3.sort(key=lst.index)

(六)正則表達式

1，正則表達式的初次認識

（七）內置模塊re的使用(還是處理字符串)

①pattern是模式字符串,也可以說 匹配規則，string是待匹配的字符串，flag是標志位(控制匹配的方式，例如是否區分大小寫，是否利用多行模式)
②用前面的 pattern匹配string,是否滿足規則
③pattern 是 按照前面正則表達式的格式書寫

# Using the re module: import it first
import re
# The r prefix makes a raw string: Python's own backslash escapes are not
# processed, so "\d" and "\." reach the regex engine unchanged.
pattern = r"\d\.\d+"
s = "i study python everyday"
# re.I ignores case. match() only tries to match at the START of the string.
match = re.match(pattern, s, re.I)
print(match)  # None: s does not start with digit.digits
s2 = "3.10python i study"
match2 = re.match(pattern, s2, re.I)
print(match2)  # <re.Match object; span=(0, 4), match='3.10'>
print("匹配的起始位置", match2.start())  # 0
print("匹配的結束位置", match2.end())    # 4: the end index is exclusive
print("匹配的位置區間", match2.span())   # (0, 4); span is a method and must be called
print("待匹配的字符串", match2.string)   # the string passed to match(): 3.10python i study
print("匹配的數據", match2.group())      # 3.10

findall()方法的結果是列表，如果沒有匹配項則結果為空列表

import re
pattern = r"\d\.\d+"
s = "i study python 3.10 every day python2.1 i love u"
s2 = "4.10python i study"
s3 = "i study python every day"
# search() scans the whole string and returns the FIRST match (or None).
# pattern and string are two separate arguments, separated by a comma.
match = re.search(pattern, s)    # matches "3.10"
match2 = re.search(pattern, s2)  # matches "4.10"
match3 = re.search(pattern, s3)  # None
# findall() returns every match as a list of strings.
lst1 = re.findall(pattern, s)    # ["3.10", "2.1"]
lst2 = re.findall(pattern, s2)   # ["4.10"]
lst3 = re.findall(pattern, s3)   # [] (empty list when nothing matches)

re.sub()方法的結果是 字符串， re.split方法的結果是列表

import re
# Alternation: match any one of the listed words.
pattern = "黑客|破解|反爬"
s = "我想學python,像破解VIP視頻，python無限反爬"
new_s = re.sub(pattern, "***", s)  # sub() returns a new string
print(new_s)  # every part of s matching pattern is replaced by ***
s2 = "https://www.baidu/s?wd=cij&ie=utf-8&tn=baidu"
# Inside [...] each character stands for itself, so "|" is NOT "or" there;
# "[?&]" already means "? or &". (The original "[?|&]" would also split on
# a literal "|".)
pattern2 = "[?&]"
lst = re.split(pattern2, s2)
print(lst)  # split() returns a list

部分實戰代碼

# Licence-plate region: the first character of each plate names the region.
# List items must be separated by ASCII commas.
lst = ["京A446262", "粵C562394", "津B123965"]
for item in lst:
    s = item[0:1]  # slice containing only the first character
    print(item, "歸屬地", s)

# Count how often a given text occurs in a string, ignoring case.
# str.count() accepts any substring, not just a single character.
def count_ignoring_case(text, word):
    """Return how many non-overlapping times word occurs in text, ignoring case.

    Both sides are upper-cased before counting; upper-casing only text
    (as the original did) makes every lower-case query count 0.
    """
    return text.upper().count(word.upper())


s = "Hellopython,Hellojava,hellophp"
# Prompt only when run as a script, so importing this code never blocks on input().
if __name__ == "__main__":
    word = input("要統計的字符")
    print("{0}在{1}中出現的次數{2}".format(word, s, count_ignoring_case(s, word)))

# Print a product table (number, name, brand, price).
lst = [
    ["01", "電風扇", "美的", 500],
    ["02", "洗衣機", "TCL", 1000],
    ["03", "微波爐", "老板", 400],
]
print("編號\t\t名稱\t\t品牌\t\t價格")
for item in lst:
    for i in item:
        print(i, end="\t\t")  # stay on the same line, columns separated by tabs
    print()  # end the row
# Reformat the rows in place: prefix the number with "000" and turn the
# price into a "$" string with two decimals.
for item in lst:
    item[0] = "000" + item[0]
    item[3] = "${:.2f}".format(item[3])
# Print the reformatted table; without this step the formatting above
# would never be shown.
print("編號\t\t名稱\t\t品牌\t\t價格")
for item in lst:
    for i in item:
        print(i, end="\t\t")
    print()

# Extract the useful data (image URLs) from a blob of text with a regex.
import re
# Stand-in for a long chunk of text copied from a web page.
s = "akjfbakjsfkx cjefsdncskdjnjnskdjf"
# Raw string so "\d" reaches the regex engine without an invalid-escape
# warning; the dots in the host name are escaped so they match only a
# literal "." instead of any character.
pattern = r"https://img\d{1}\.baidu\.com/it/u=\d*,\d*&fm=\d*&fmt=auto"
lst = re.findall(pattern, s)  # findall() returns a list
for item in lst:
    print(item)