数据库
首页 > 数据库 > 使用selenium爬取51Job职位信息 入库mongoDB

使用selenium爬取51Job职位信息 入库mongoDB

作者:互联网

selenium_51job_com.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:51job_com.py
# Author:LGSP_Harold
import time

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class HandleWebdriver:
    """Scrape 51job search results with headless Firefox and store them in MongoDB.

    Typical use is ``HandleWebdriver().handle_job()``: the constructor starts
    the browser, and ``handle_job`` drives the search, walks the result pages
    and always shuts the browser down when it finishes or fails.
    """

    # Search landing page that hosts the keyword box and the paginated job list.
    SEARCH_URL = 'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

    # NOTE(review): credentials are hard-coded in the connection string;
    # consider loading them from configuration instead.
    MONGO_URI = 'mongodb://admin:admin@127.0.0.1:27017'

    def __init__(self):
        """Start a headless Firefox session and keep it on ``self.browser``."""
        options = Options()
        options.add_argument('--headless')

        # ``options=`` replaces the deprecated ``firefox_options=`` keyword.
        self.browser = webdriver.Firefox(options=options)

    def handle_job(self):
        """Search for a user-supplied keyword and process every result page.

        Prompts on stdin for the job keyword, submits the search, then parses
        and stores each result page until the "next page" link can no longer
        be clicked. The browser is quit when the method exits, whether it
        finished normally or raised.

        Raises:
            selenium.common.exceptions.TimeoutException: if the search box or
                the job list does not appear within 5 seconds.
        """
        try:
            self.browser.get(self.SEARCH_URL)

            # Explicit wait for the search box; ``until`` returns the located
            # element or raises TimeoutException, so no truthiness check is
            # needed.
            search_box = WebDriverWait(self.browser, 5, 0.5).until(
                EC.presence_of_element_located((By.ID, 'keywordInput')))
            input_keyword = input('请输入要查找的岗位:')
            search_box.send_keys(input_keyword)
            self.browser.find_element(By.ID, 'search_btn').click()

            # Wait for the result list container before reading the page source.
            WebDriverWait(self.browser, 5, 0.5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist')))

            while True:
                # Fixed pause before reading the page source; presumably gives
                # the job list time to finish rendering after navigation.
                time.sleep(2)
                self.handle_parse(self.browser.page_source)
                try:
                    self.browser.find_element(
                        By.XPATH, '//li[@class="next"]/a').click()
                except WebDriverException:
                    # No usable "next" link: treat it as the last page.
                    # Catching the Selenium base class keeps the original
                    # best-effort stop without swallowing KeyboardInterrupt.
                    break
        finally:
            # Always release the browser process, even on timeouts or parse
            # errors.
            self.browser.quit()

    def handle_parse(self, page_source):
        """Extract job records from one result page and store them.

        Args:
            page_source: HTML text of a search result page.

        Each job entry under ``div.j_joblist`` becomes a dict with the keys
        ``job_name``, ``time``, ``money``, ``address`` and ``tags``. A missing
        salary is stored as '面议' and a missing tag list as '暂无'. The
        collected records are handed to ``handle_mongodb``.

        Raises:
            IndexError: if an entry lacks a job name, time or address node.
        """
        html_obj = etree.HTML(page_source)
        items = html_obj.xpath('//div[@class="j_joblist"]/div[@class="e"]')
        data_list = []
        for item in items:
            salary = item.xpath('.//a/p[@class="info"]/span[@class="sal"]/text()')
            tags = item.xpath('.//a/p[@class="tags"]/span/i/text()')
            data_list.append({
                'job_name': item.xpath('.//a/p[@class="t"]/span[@class="jname at"]/text()')[0],
                'time': item.xpath('.//a/p[@class="t"]/span[@class="time"]/text()')[0],
                'money': salary[0] if salary else '面议',
                'address': item.xpath('.//a/p[@class="info"]/span[@class="d at"]/text()')[0],
                'tags': self._format_tags(tags),
            })
        self.handle_mongodb(data_list)

    @staticmethod
    def _format_tags(tags):
        """Join tag texts as 'a | b | ', or return '暂无' when there are none."""
        if not tags:
            # xpath() yields an empty list rather than raising, so the default
            # has to be chosen explicitly.
            return '暂无'
        # Keep the stored format (each tag followed by ' | ') unchanged.
        return ''.join(f'{tag} | ' for tag in tags)

    def handle_mongodb(self, data_list):
        """Insert the given records into ``db_51job_com.collections_51job``.

        Args:
            data_list: list of job dicts; an empty list is a no-op because
                ``insert_many`` rejects empty input.
        """
        if not data_list:
            return
        client = pymongo.MongoClient(self.MONGO_URI)
        try:
            collection = client['db_51job_com']['collections_51job']
            collection.insert_many(data_list)
        finally:
            # One client is opened per page, so close it to avoid leaking
            # connections over a long crawl.
            client.close()

if __name__ == '__main__':
    # Run the crawl only when executed as a script, so importing this module
    # does not launch a browser. The instance is not named ``selenium`` to
    # avoid shadowing the imported package.
    crawler = HandleWebdriver()
    crawler.handle_job()

 

标签:mongoDB,51Job,selenium,self,text,data,class,browser
来源: https://www.cnblogs.com/Harold-Hua/p/15169297.html