
Crawling Recruitment Information with Python


Pitfalls of Selenium in practice

  1. The selenium library can get you past many anti-crawling mechanisms: a script driving a real browser behaves almost exactly like a human visitor, so it is not easily blocked. Even so, problems still come up in practice.
  2. A spider can only grab data that it can "see". As long as the programmer supplies the right locators, the spider can find and extract the target fields. But front-end developers often change tags when they rework a page, so a locator that used to match can stop matching, and the spider crashes.
  3. Solutions (a short sketch follows this list):
    1. Before writing any code, open the target page and study it. The main things to observe are where and how your target content is displayed on the page.
    2. Use exception handling so the spider does not die; the thrown exception also pinpoints which locator broke, so the code can be fixed promptly.
    3. Add reasonable waits between accesses. They slightly reduce crawling speed, but the data comes back reliably and you redo far less work.
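
As a concrete illustration of points 2 and 3, here is a minimal sketch (not from the original code) that wraps a lookup in an explicit wait plus exception handling; safe_text and its arguments are placeholder names:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def safe_text(driver, xpath, timeout=10, default=''):
    # Wait up to `timeout` seconds for the element instead of failing instantly,
    # and return `default` instead of crashing when the page structure changed.
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
        return element.text
    except Exception as e:
        print('Locating failed, check the XPath:', e)  # points at the broken locator
        return default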

Application example:

Collecting Python job postings from Liepin (for practice and code testing only).

Chrome is driven automatically to collect the data.

The main fields captured are company name, job title, work address, salary, and job requirements.

Finally, the data is stored in a MySQL database. The storage module comes first:

# save_data.py -- imported below as lp_spider.save_data
import pymysql

def save(table):
    print('------------------------------')
    global conn
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           passwd='XXX',
                           port=8080,  # adjust to your MySQL port (3306 by default)
                           charset='utf8')
    global cur
    cur = conn.cursor()
    print('Cursor obtained')
    try:
        cur.execute("create database lp character set utf8;")
    except Exception as e:
        print(e)  # the database already exists
    cur.execute('use lp;')
    try:
        cur.execute("create table "+table+"(id int,company char(100),job char(200),\
            address char(100),salary char(100),ask varchar(5000))character set utf8;")
    except Exception as e:
        print(e)  # the table already exists
    print('Table created')

def inser_data(table, id, company, job, address, salary, ask):
    sql_insert = 'insert into '+table+'(id,company,job,address,salary,ask) values (%s,%s,%s,%s,%s,%s);'
    try:
        cur.execute(sql_insert, [id, company, job, address, salary, ask])
    except Exception as e:
        print(e)
    conn.commit()

def my_txt(table, ask):
    # Append the raw requirement text to <table>.txt for the word cloud later
    f = open(table+'.txt', 'a+', encoding='utf-8')
    f.write(ask)
    f.close()

'''
All the job-requirement text is also saved to a local txt file for the word cloud.
The company name, job title and salary fields all go into the database.
Because the salary field is displayed as a range ("XX-XX"), all ranges are stored as strings.
'''
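
A quick standalone smoke test of the module might look like the sketch below; the table name and row values are made up for illustration:

from lp_spider import save_data

save_data.save('demo')  # creates the lp database and a demo table if needed
save_data.inser_data('demo', '0', 'ACME Ltd.', 'Python Developer',
                     'Beijing', '15-25k', 'Familiar with crawlers')  # one sample row
save_data.my_txt('demo', 'Familiar with crawlers')  # also appended to demo.txt
save_data.cur.close()
save_data.conn.close()

The spider itself: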
from selenium import webdriver
from time import sleep
import random
import re
from lp_spider import save_data
# from lp_spider import py_cloud

start_url = 'https://www.liepin.com/zhaopin/'

def open_url():
    global driver
    driver = webdriver.Chrome()
    driver.get(start_url)
    driver.maximize_window()

def get_page(type):
    # Implicit wait so the page can finish loading
    driver.implicitly_wait(20)
    # Type the job category into the search box
    driver.find_element_by_xpath('//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/input').send_keys(type)
    # Click search
    driver.find_element_by_xpath('//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/button').click()
    # Scroll down a little
    driver.execute_script('window.scrollBy(0, 500)')

def get_info(table):
    global id     # row id
    id = 0
    for j in range(1, 101):      # result pages
        for i in range(1, 41):   # postings per page
            global company       # company name
            global job           # job title
            global salary        # salary
            global Ask           # job requirements
            # The badge text ('Enterprise', 'Hunting', 'straight', 'optimal') is a
            # translated stand-in for the Chinese badge characters shown on the page;
            # match it against what the live site actually displays.
            try:
                ty = driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li['+str(i)+']/i/b').text
            except:
                ty = 'nothing'
            print(ty)
            if ty == 'Enterprise':
                #sleep(random.choice(range(5, 15)))
                try:
                    # Open the posting (it opens in a new tab)
                    driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li['+str(i)+']/div/div[1]/h3/a').click()
                    print('Page address:', end=' ')
                    print(driver.current_url)
                    # Switch to the newest handle, i.e. the posting tab
                    handles = driver.window_handles
                    driver.switch_to.window(handles[len(handles)-1])
                    #print(driver.current_url)
                    driver.implicitly_wait(20)
                    # Company name: try the enterprise layout first, then the headhunter layouts
                    try:
                        company = driver.find_element_by_xpath(
                            '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h3/a[@title]').text
                    except Exception as e:
                        print(e)
                        try:
                            company = driver.find_element_by_xpath(
                                '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h3').text
                        except Exception as e:
                            print(e)
                            company = driver.find_element_by_xpath(
                                '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]').text
                    #print(company)
                    # Job title
                    try:
                        job = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h1').text
                    except Exception as e:
                        print(e)
                        job = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]').text
                    #print(job)
                    #sleep(random.choice(range(1, 5)))
                    # Salary: '面议' means "negotiable"; ranges such as "15-25k" stay strings
                    try:
                        salary = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[1]').text
                        salary_m = re.findall('[\u4e00-\u9fa5]+', salary)
                        if (salary_m[0] == '面议'):
                            salary = ['面议']
                        else:
                            salary = driver.find_element_by_xpath(
                                '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[1]').text
                            if (len(salary) < 8):
                                salary = [salary]
                            else:
                                salary = re.findall('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+', salary)
                    except Exception as e:
                        print(e)
                        salary = driver.find_element_by_xpath(
                            '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[1]').text
                        if (len(salary) < 8):
                            salary = [salary]
                        else:
                            salary = re.findall('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+', salary)
                    #print(salary)  # after processing, salary is a list
                    # Work address
                    try:
                        address = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span/a').text
                    except Exception as e:
                        print(e)
                        try:
                            address = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[2]/span').text
                        except Exception as e:
                            print(e)
                            try:
                                # '/text()' is not a valid element locator, so this
                                # attempt always falls through to the plain span below
                                address = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span/text()').text
                            except Exception as e:
                                print(e)
                                address = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span').text
                    #print(address)
                    # Scroll the description into view
                    driver.execute_script('window.scrollBy(0,400)')
                    #sleep(10)
                    # Job requirements
                    try:
                        Ask = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[3]/div').text
                    except Exception as e:
                        Ask = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[3]/div').text
                    #Ask = Ask.replace("\n", '')
                    # Strip boilerplate headings from the requirement text (Chinese
                    # headings in the original; translated stand-ins here). str.replace
                    # never raises, so no try/except is needed around these.
                    for heading in ('Job requirements:', 'Responsibilities:', 'Job description:',
                                    'Post requirements:', 'Responsibility description:', 'Qualifications:'):
                        Ask = Ask.replace(heading, '')
                    # print(Ask)
                    driver.close()
                    handles = driver.window_handles
                    sleep(random.choice(range(1, 5)))
                    # Switch back to the results-list tab
                    driver.switch_to.window(handles[len(handles)-2])
                    # # Slide the scrollbar
                    # driver.execute_script('window.scrollBy(0, 145)')
                    print(j, end='.')
                    print(i)
                    save_data.inser_data(table, str(id), company, job, address, salary[0], Ask)
                    save_data.my_txt(table, Ask)
                    id = id + 1
                except:
                    pass
            else:
                print(j, end='.')
                print(i, end=' done')
            if i < 40:
                # Scroll past the current row; row heights differ slightly by badge type
                if ty == 'Enterprise':
                    driver.execute_script('window.scrollBy(0, 145)')
                if ty == 'Hunting':
                    driver.execute_script('window.scrollBy(0,141)')
                if ty == 'straight':
                    driver.execute_script('window.scrollBy(0,145)')
                if ty == 'nothing':
                    driver.execute_script('window.scrollBy(0,137)')
                if ty == 'optimal':
                    driver.execute_script('window.scrollBy(0,139)')
        # Turn to the next results page
        try:
            driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[8]').click()
        except:
            driver.execute_script('window.scrollTo(0,0)')  # back to the top of the page
            driver.execute_script('window.scrollBy(0,{})'.format(145 * 42))
            driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[8]').click()
        sleep(random.choice(range(3, 5)))
        driver.execute_script('window.scrollBy(0, 500)')
    save_data.cur.close()
    save_data.conn.close()

if __name__ == '__main__':
    while True:
        print('Enter the job category to crawl, then press Enter --> ', end='')
        ty = input()
        save_data.save(ty)
        open_url()
        get_page(ty)
        get_info(ty)
        # py_cloud.make_cloud('python')
        print('Crawl finished')
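
The heart of get_info is the tab dance around each posting: click the listing entry, switch to the newest window handle, scrape the fields, close the tab, and switch back to the results list. Stripped of the field extraction, the pattern is roughly the following sketch (link stands for the listing element, and the XPath is a placeholder):

link.click()                                        # the posting opens in a new tab
driver.switch_to.window(driver.window_handles[-1])  # newest handle = posting tab
data = driver.find_element_by_xpath('//h1').text    # scrape whatever is needed
driver.close()                                      # close the posting tab
driver.switch_to.window(driver.window_handles[-1])  # back to the results list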
# Word cloud
from wordcloud import WordCloud
import cv2
import jieba

# Note: my_txt() writes to <table>.txt, so 'lp.txt' assumes the crawl was run
# with the table name 'lp'; adjust the filename to match your run.
with open('lp.txt', 'r', encoding='utf-8') as f:
    text = f.read()
cut_text = " ".join(jieba.cut(text))    # Chinese word segmentation
color_mask = cv2.imread('python1.jpg')  # mask image that shapes the cloud
cloud = WordCloud(
    # Set a font; without one, Chinese characters render as garbage
    font_path="C:\\Windows\\Fonts\\STXINGKA.TTF",
    # font_path=path.join(d, 'simsun.ttc'),
    # Background color
    background_color='white',
    # Shape of the word cloud
    mask=color_mask,
    # Maximum number of words
    max_words=10000,
    # Maximum font size
    max_font_size=100
)
wCloud = cloud.generate(cut_text)
wCloud.to_file('cloud.png')

import matplotlib.pyplot as plt
plt.imshow(wCloud, interpolation='bilinear')
plt.axis('off')
plt.show()
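
If the crawl was run under a different table name, the requirement text can just as well be pulled straight from the database instead of the txt file. A minimal sketch using the same connection settings as save_data.py (the table name 'python' is assumed):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='XXX',
                       port=8080, db='lp', charset='utf8')
cur = conn.cursor()
cur.execute('select ask from python;')  # 'python' is the assumed table name
text = ' '.join(row[0] for row in cur.fetchall())
conn.close()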

