程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您现在的位置: 程式師世界 >> 編程語言 >  >> 更多編程語言 >> Python

51job crawler + data visualization Python

編輯:Python

1. Login module:

This module uses XPath and Selenium. In the end it is stopped by the gap-slider (notch) verification code: the success rate of solving that verification code is extremely low.

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from PIL import Image
import time
from time import sleep
# Module-level browser session shared by the functions of the login script.
# NOTE: a `global` statement at module scope has no effect; `web` and `wait`
# are module-level names simply because they are assigned here.
global web,wait
web=webdriver.Firefox()# Open the browser
# Explicit-wait helper bound to the browser above, created with a timeout of 20.
wait = WebDriverWait(web, 20)
def denglu(username='18658607893', password='cjhcjh123'):
    """Open the 51job login page, fill in the credentials and submit the form.

    Works on the module-level ``web`` driver.  Elements are looked up with
    ``find_element(By..., ...)`` because the ``find_element_by_*`` helpers
    were deprecated and later removed from Selenium.

    :param username: login name typed into the ``loginname`` field
    :param password: password typed into the ``password`` field
    """
    # NOTE(review): the default credentials are hard-coded in the source;
    # pass your own values (or load them from configuration) instead.
    web.get("https://login.51job.com/login.php?loginway=0&isjump=0&lang=c&from_domain=i&url=http%3A%2F%2Fsearch.51job.com%2Flist%2F360000%2C000000%2C0000%2C00%2C9%2C99%2C%252B%2C2%2C1.html%3Flang%3Dc%26postchannel%3D0000%26workyear%3D99%26cotype%3D99%26degreefrom%3D99%26jobterm%3D99%26companysize%3D99%26ord_field%3D0%26dibiaoid%3D0%26line%3D%26welfare%3D")  # Load the login page
    # web.maximize_window()  # Full screen maximization window
    web.find_element(By.XPATH, '//*[@id="loginname"]').send_keys(username)
    web.find_element(By.XPATH, '//*[@id="password"]').send_keys(password)
    # presumably the "agreement read" checkbox -- TODO confirm against the page
    web.find_element(By.XPATH, '//*[@id="isread_em"]').click()
    # NOTE(review): positional XPath; what this <span> is cannot be told from
    # the code and it will break if the page layout changes.
    web.find_element(By.XPATH, '/html/body/div[4]/div/div/span').click()
    sleep(3)
    web.find_element(By.ID, 'login_btn_withPwd').click()
# A screenshot of an element
def save_pic(obj, name, scale=1.25):
    """Take a full-page screenshot and save the region covered by one element.

    The page is saved to ``.\\51job.png`` through the module-level ``web``
    driver, then cropped to the element's rectangle and written to
    ``51job_<name>.png``.

    :param obj: element whose ``location`` (``x``/``y``) and ``size``
        (``width``/``height``) mappings describe its rectangle
    :param name: suffix used in the output file name
    :param scale: factor applied to the element coordinates to convert them
        to screenshot pixels; defaults to the 1.25 the code used originally
    :return: None; failures are printed rather than raised
    """
    try:
        pic_url = web.save_screenshot('.\\51job.png')
        print("%s: Screenshot successful !" % pic_url)
        # Get element location information, scaled to screenshot pixels
        left = obj.location['x'] * scale
        top = obj.location['y'] * scale
        right = left + obj.size['width'] * scale
        bottom = top + obj.size['height'] * scale
        print(' chart :' + name)
        print('Left %s' % left)
        print('Top %s' % top)
        print('Right %s' % right)
        print('Bottom %s' % bottom)
        print('')
        file_name = '51job_' + name + '.png'
        # The context manager closes the screenshot file once the cropped
        # copy has been written out.
        with Image.open('.\\51job.png') as screenshot:
            screenshot.crop((left, top, right, bottom)).save(file_name)
    except Exception as msg:
        # Exception (not BaseException) so Ctrl-C and SystemExit still propagate.
        print("%s: Screenshot failed !" % msg)
# Make an element visible
def show_element(element):
    """Overwrite ``element``'s inline style with ``display: block;`` via JavaScript."""
    script = "arguments[0].style=arguments[1]"
    visible_style = "display: block;"
    web.execute_script(script, element, visible_style)
# Make an element invisible
def hide_element(element):
    """Overwrite ``element``'s inline style with ``display: none;`` via JavaScript."""
    script = "arguments[0].style=arguments[1]"
    hidden_style = "display: none;"
    web.execute_script(script, element, hidden_style)
def cut():
    """Locate the three captcha canvases and save a cropped screenshot of each.

    Screenshots are taken through ``save_pic`` under the names ``back``,
    ``slice`` and ``full``, toggling canvas visibility in between.
    """
    def _find_canvas(selector):
        # Wait (via the module-level ``wait``) until the canvas is in the DOM.
        return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))

    c_background = _find_canvas('canvas.geetest_canvas_bg.geetest_absolute')
    c_slice = _find_canvas('canvas.geetest_canvas_slice.geetest_absolute')
    c_full_bg = _find_canvas('canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute')

    # Background canvas, captured while the slice canvas is hidden.
    hide_element(c_slice)
    save_pic(c_background, 'back')

    # Slice canvas, made visible again before its capture.
    show_element(c_slice)
    save_pic(c_slice, 'slice')

    # Full-background canvas, forced visible before its capture.
    show_element(c_full_bg)
    save_pic(c_full_bg, 'full')
# Determine whether the pixels are the same
def is_pixel_equal(bg_image, fullbg_image, x, y, threshold=120):
    """
    Report whether two images have (nearly) the same colour at one position.

    :param bg_image: (Image) Gap picture
    :param fullbg_image: (Image) Full picture
    :param x: (Int) Location x
    :param y: (Int) Location y
    :param threshold: (Int) Decision value; a channel counts as equal only
        when its absolute difference is strictly below this value
    :return: (Boolean) Whether the pixels are the same
    """
    # Get the pixel of the gap image (first three channels are compared)
    bg_pixel = bg_image.load()[x, y]
    # Get the pixel of the complete picture
    fullbg_pixel = fullbg_image.load()[x, y]
    # abs() must wrap the difference itself.  The previous code wrapped the
    # comparison (abs(a - b < t)), so any negative difference was accepted.
    return all(
        abs(bg_pixel[channel] - fullbg_pixel[channel]) < threshold
        for channel in range(3)
    )
# Calculate the sliding distance
def get_distance(bg_image, fullbg_image):
    '''
    Find the first pixel column in which the two pictures differ.

    Columns are scanned left to right and, within a column, top to bottom,
    over the dimensions of ``fullbg_image``.

    :param bg_image: (Image) Gap picture
    :param fullbg_image: (Image) Full picture
    :return: (Int) x coordinate of the first differing column; the initial
        position (0) when no differing pixel is found, so callers always
        receive a number instead of None
    '''
    # The initial position of the slider
    distance = 0
    width, height = fullbg_image.size
    # Traverse the abscissa of pixels
    for i in range(distance, width):
        # Traverse the vertical coordinates of pixels
        for j in range(height):
            # The first differing pixel marks the distance the slider must move
            if not is_pixel_equal(fullbg_image, bg_image, i, j):
                return i
    # No difference found: fall back to the initial position explicitly.
    return distance
# Drag the captcha slider
def slide():
    """Compute the drag distance from the saved screenshots and drag the slider.

    Reads ``.\\51job_back.png`` and ``.\\51job_full.png``, divides the pixel
    distance by 1.25 (undoing the factor applied when the screenshots were
    cropped), then performs a hold / move / move / release gesture on the
    slider button.  Errors raised while locating or dragging the slider are
    printed, not raised.
    """
    distance = get_distance(
        Image.open('.\\51job_back.png'),
        Image.open('.\\51job_full.png'),
    ) / 1.25
    try:
        # Find the slider button
        slider = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.geetest_slider_button'))
        )
        if not slider:
            print("=== No slider validation ===")
        else:
            print("==== With slider verification =====")
            (
                webdriver.ActionChains(web)
                .click_and_hold(slider)      # press and keep holding
                .pause(0.2)
                .move_by_offset(distance, 0)
                .pause(0.6)
                .move_by_offset(36, 0)       # fixed correction offset
                .pause(0.6)
                .release()
                .perform()                   # run the queued gesture
            )
            time.sleep(5)
    except Exception as e:
        print("===" + str(e))
# Run the login script in order: submit the login form, capture the captcha
# canvases, then attempt the slider drag.
denglu()
cut()
slide()

2. Search + crawling module. It is not combined with the login module above, because the login success rate is really low. (It includes a small amount of data cleaning, for the salary field.)

import re
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from PIL import Image
import time
import json
from time import sleep
import requests
# Module-level setup for the search/crawl script: start a browser, open the
# 51job search page and read the search keyword from standard input.
# NOTE: a `global` statement at module scope has no effect.
global web,wait
web=webdriver.Firefox()# Open the browser
wait = WebDriverWait(web, 20)
web.get("https://search.51job.com/list/360000,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=") # In the browser address bar , Enter the website
web.maximize_window()# Full screen maximization window
print(' Please enter the position you want to search ')
# `s` is the keyword later typed into the search box by sousuo().
s = input()
def sousuo():
    """Click through the search-page filters, type the keyword and start the search.

    Works on the module-level ``web`` driver and reads the module-level
    keyword ``s``.  Elements are looked up with ``find_element(By.XPATH, ...)``
    because ``find_element_by_xpath`` was deprecated and later removed from
    Selenium.
    """
    sleep(3)
    # NOTE(review): the clicks below use positional XPaths; which filter
    # options they select cannot be told from the code -- verify against the
    # live page, since any layout change will break them.
    click_sequence = (
        '/html/body/div[2]/div[2]/div[1]/div[1]/div[2]/div[2]',
        '/html/body/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[2]/em',
        '/html/body/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div/table/tbody[2]/tr/td[2]/em',
        '/html/body/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div/table/tbody[4]/tr/td[4]/em',
        '/html/body/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[3]/table/tbody/tr/td/em',
        '/html/body/div[2]/div[2]/div[1]/div[2]/div/div[3]/span',
    )
    for xpath in click_sequence:
        web.find_element(By.XPATH, xpath).click()
    sleep(2)
    web.find_element(By.XPATH, '//*[@id="keywordInput"]').send_keys(s)
    sleep(0.5)
    web.find_element(By.XPATH, '//*[@id="search_btn"]').click()
def pachu():
    """Crawl 51job search-result pages and append the postings to a CSV file.

    For each page 1..496 the HTML is fetched with ``requests``, the JSON
    assigned to ``window.__SEARCH_RESULT__`` is extracted, and one row per
    entry of ``engine_jds`` is appended: job name, company, cleaned salary,
    first two characters of the work area, education tag, update date and
    detail link.  A header row is written first on every call.

    Rows go through ``csv.writer`` so that fields containing commas stay in
    their column, and the file is closed even if a request or parse fails.
    """
    import csv  # function-scope import: not among the imports of this script

    # NOTE(review): the Cookie is a captured browser session value and will
    # expire; the search keyword in the URL below is fixed to "python" and
    # does not use the keyword read from the user.
    headers = {
        'Cookie': (
            '_uab_collina=165432020619222940563756; acw_tc=ac11000116543202028027511e010fbf52b6b519923541b8e77a0e3850f097; guid=b6ba05a019649425e1e1883cbfeb8b3f; search=jobarea%7E%60360000%2C020000%2C080200%2C170200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60360000%2C020000%2C080200%2C170200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60360000%2C020000%2C080200%2C170200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60360000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; ssxmod_itna=YqIx2DBDyii=iteGHD8Qbq0IKpQD0DbAGQG=CcD0vxpeGzDAxn40iDto=ThloZA7u5Pf8lO+hxZWEDedpsWpOAfOe6+r4GLDmKDylYxheDx1q0rD74irDDxD3DbRdDSDWKD9D0bMgy2VKDbxi36xDbDiWkvxGCDeKD0PqHDQKDucKPslrkYRPw9ro1eGAxKnKD9OoDsc0fC4oLU/2GfCKDX1dDvhv1ZgPpDB4C/pyGbQDPfmlFsxDOy4017zqDWcIxsel5DjTk1gaKWB=DBoUe/j+x5G0q8b0gqlGKOlGZeBqxNQaq6Nm4O2PH7DDDfPGhdmDxD=; '
            'ssxmod_itna2=YqIx2DBDyii=iteGHD8Qbq0IKpQD0DbAGQG=7DnFfbyqDs=KeDL7ZiybDqnRDq=zKmrqc5eXtAGIxq9Qigc5kcKYMpaCd3br2dze3rkMD3YxcGOwfeYeu80KsbNBrKmli7vKhNc8pxW0Qiie8pkWeW+Yhwie8GHKiAiO0WmaSwT8lbTxExTOlRbt8gd7+qmXm7zpU3fhU33ptpioI36KQaflj34G/cm8SpNz/d2SScWYVpIp3e=E6oXGRezdgh2gIwUhRT8xF=uiMMIcn0bk/7pKQA=nQAXlFgDYlxvtgTp0QWihafmZq1YxToBHL+sk3=WSH3earfsVbOiuDb21kRF93bc04pRhhd8c7Y+S1V8whT+OYWB+NUhFrk75lrqLT/0FBozDWmjbYCMdw81BObfq7n7OTw7M3FKdzqaY2dmBOKPT88qEquxxO1uQrZQTPcOnjE8kObCHDm7pODDkadSdOYFALk=GEtldY7aCDjBqLk4HhQOz41DI54U+F6+SfxEDDwZGKnDc9E5s6xiAAA=elo3lxCm9M=K=uO/0EEnherhZWIicoTkwZBK6ph1iwy9OSZOOww8kqbue=G2eavQeo=ZkIih99vX++nn+MFXmBmE0C8TxDjKDeuz/rxuhTwYubgCphTAi3GBz0BqEbmC/zkNHCbZ=r/bo/jTlV7D4D==='
        ),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
    }
    education_levels = (' junior college ', ' Undergraduate ', ' master ')

    with open('C:\\Users\\55151\\Desktop\\python Curriculum \\51jobpachong.csv', 'a+') as f:
        # lineterminator='\n' keeps the same line endings the plain
        # f.write(...'\n') calls produced in this text-mode file.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([' Position ', ' company ', ' Wages ', ' Workplace ',
                         ' Education ', ' Update time ', ' details '])
        for page in range(1, 497):
            print(' Climbing to the top {} page \n'.format(page))
            url = 'https://search.51job.com/list/360000%252c020000%252c080200%252c170200,000000,0000,00,9,99,python,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(page)
            ree = requests.get(url=url, headers=headers).text
            r = re.findall('window.__SEARCH_RESULT__ = (.*?)</script>', ree, re.S)
            if not r:
                # No embedded payload (e.g. a block/verification page):
                # json.loads('') would raise, so skip this page instead.
                print(' Page {} has no search result data , skipped \n'.format(page))
                sleep(0.1)
                continue
            infodict = json.loads(' '.join(r))
            print(infodict)
            for i in infodict['engine_jds']:
                # Reset per posting so a posting without an education tag
                # neither raises NameError nor inherits the previous value.
                attribute_text = ''
                for k in i['attribute_text']:
                    if k in education_levels:
                        attribute_text = k
                n = _average_salary(i['providesalary_text'])
                if n == 0:
                    providesalary_text = ' Face to face discussion '
                else:
                    providesalary_text = str(n) + ' element '
                writer.writerow([
                    i['job_name'],
                    i['company_name'],
                    providesalary_text,
                    i['workarea_text'][0:2],
                    attribute_text,
                    i['updatedate'],
                    i['job_href'],
                ])
            # Short pause between page requests.
            sleep(0.1)
    print('done!\n')


def _average_salary(text):
    """Return the mid-point of a monthly salary range parsed from ``text``, or 0.

    Only the two monthly units are converted; any other text (including the
    per-day unit) yields 0.  When just one number is present it is used as
    both ends of the range instead of raising IndexError.
    """
    # The longer unit must be tested first: it contains the shorter one.
    if " ten thousand / month " in text:
        unit = 10000
    elif " thousand / month " in text:
        unit = 1000
    else:
        return 0
    numbers = re.findall(r"\d+\.?\d*", text)  # Extract the numbers from the salary string
    if not numbers:
        return 0
    low = float(numbers[0])
    high = float(numbers[1]) if len(numbers) > 1 else low
    return int((low + high) * unit / 2)  # Average the wage range
# Run the crawl script in order: drive the search form in the browser, then
# fetch the result pages and write them to the CSV file.
sousuo()
pachu()

3. Visualization module:
 

import csv
import wordcloud as wc
import jieba
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt
from pyecharts.charts import Map
from pyecharts.charts import Bar
from pyecharts import options
from pyecharts.commons.utils import JsCode
import csv
import pandas as pd
from pyecharts.charts import Pie
from pyecharts import options as opts
# Containers filled from the crawled CSV and used by the charts below.
data_x = []          # column 3 (workplace) of every row that has a numeric salary
data_y = []          # column 2 (salary) with its last character stripped
province_date = []   # [workplace, salary] pairs
data_number_y = []   # same values as data_x
_job_names = []      # column 0 of every row, joined into `ss` afterwards
# newline='' is what the csv module asks for when opening a file for reading.
with open("C:\\Users\\55151\\Desktop\\python Curriculum \\51jobpachong.csv", newline='') as fp:
    read = csv.reader(fp)  # Read file contents
    for i in read:
        if len(i) < 4:
            # Blank or truncated line: indexing it would raise IndexError.
            continue
        _job_names.append(i[0])
        # Skip header rows and rows without a numeric salary.
        if i[2] == ' Wages ' or i[2] == ' Face to face discussion ':
            continue
        salary = i[2][:len(i[2])-1]  # drop the last character of the salary text
        province = [i[3], salary]
        data_x.append(i[3])
        data_y.append(salary)
        data_number_y.append(i[3])
        province_date.append(province)
# All job names concatenated; joined once instead of `+=` inside the loop.
ss = ''.join(_job_names)
# Build a word cloud from the concatenated job names in `ss`.
res = jieba.lcut(ss) # Segment the text into a list of words
text = " ".join(res) # Join the words with single spaces
mask = np.array(Image.open("C:\\Users\\55151\\Desktop\\ The word cloud .jpg")) # Image loaded as an array and passed as the cloud's mask
word_cloud = wc.WordCloud(font_path="msyh.ttc", mask=mask) # Create the word cloud object with the given font file and mask
word_cloud.generate(text) # Generate the cloud from the space-separated text
plt.imshow(word_cloud) # Draw the cloud on the current matplotlib axes
word_cloud.to_file("wordcloud.png") # Save the cloud as a picture
plt.show() # Open the matplotlib window
# Bar chart of salary per workplace, limited to the first 30 rows.
data_x = data_x[0: 30]
print(data_x)
date_many_map = (  # Salary crystal histogram; holds the value returned by render()
    Bar()
    .add_xaxis(data_x)
    # The series is sliced to the same 30 entries as the x axis; previously
    # the full-length list was passed against 30 categories.
    .add_yaxis(" Province ", data_y[0: 30], category_gap="3%")
    .set_series_opts(
        itemstyle_opts={
            "normal": {
                # Vertical gradient fill, evaluated as JavaScript in the page.
                "color": JsCode(
                    """new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
offset: 0,
color: 'rgba(0, 244, 255, 1)'
}, {
offset: 1,
color: 'rgba(0, 77, 167, 1)'
}], false)"""
                ),
                "barBorderRadius": [50, 50, 50, 50],
                "shadowColor": "rgb(0, 160, 221)",
            }
        }
    )
    .set_global_opts(
        title_opts=options.TitleOpts(title=" The salary of each province "),
        xaxis_opts=options.AxisOpts(
            name=' salary ',
            name_location='middle',
            name_gap=45,  # Distance between the axis name and the axis line
            name_textstyle_opts=options.TextStyleOpts(
                font_family='Times New Roman',
                font_size=10,  # Label font size
            ),
        ),
        yaxis_opts=options.AxisOpts(
            name=' thousand / month ',
            name_location='middle',
            name_gap=30,
            split_number=5,
            max_=30000,
            name_textstyle_opts=options.TextStyleOpts(
                font_family='Times New Roman',
                font_size=10,
            ),
        ),
    )
    .render("C:\\Users\\55151\\Desktop\\ Salary crystal histogram .html")
)
print(" Salary crystal histogram can be created !!!")
# Pie chart of how often each education value occurs in the crawled CSV.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed lines are still skipped.
data = pd.read_csv('C:\\Users\\55151\\Desktop\\python Curriculum \\51jobpachong.csv', on_bad_lines='skip', encoding='gbk')
counts = data[' Education '].value_counts()
l1 = counts.index.tolist()   # distinct education values
l2 = counts.values.tolist()  # number of rows for each value
# Data format sorting: [[label, count], ...] as expected by Pie.add
data_pair = [list(z) for z in zip(l1, l2)]
(
    # Set chart background color
    Pie(init_opts=opts.InitOpts(bg_color="rgba(206, 206, 206, 0.3)"))
    .add(
        # Series name, that is, the name of the pie chart
        series_name=" Academic analysis ",
        # Series data items
        data_pair=data_pair,
        # Radius of the pie, as a percentage
        radius="55%",
        # Center of the pie chart: horizontal and vertical position
        center=["50%", "50%"],
        # Label configuration item
        label_opts=opts.LabelOpts(is_show=False, position="center"),
    )
    # Global settings
    .set_global_opts(
        # Set title
        title_opts=opts.TitleOpts(
            # Title text
            title=" Analysis on the proportion of educational background ",
            # Horizontal position of the title
            pos_left="center",
            # Offset of the title from the top of the container
            pos_top="20",
            # Set the title color
            title_textstyle_opts=opts.TextStyleOpts(color="#000"),
        ),
        # Legend configuration item
        legend_opts=opts.LegendOpts(
            is_show=True,
            # Vertical display
            orient="vertical",
            # 5% from the left
            pos_left="5%",
            # 60% from the top
            pos_top="60%",
        ),
    )
    # Series settings
    .set_series_opts(
        tooltip_opts=opts.TooltipOpts(
            trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
        ),
        # Set label color
        label_opts=opts.LabelOpts(color="#000"),
    )
    .render('C:\\Users\\55151\\Desktop\\xueli.html')
)

 


  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved