python爬虫开发之“智联招聘”网页爬取

先贴上需求：

 1. 输入起始页 和结束页 爬取智联招聘上 与python相关职业
 2. 爬取的信息包括 就业岗位名称 薪资 地区  公司名称  需求{包括学历和经验}
 3. 爬取的信息以字典形式保存到mongodb数据库中

附上url https://sou.zhaopin.com/?jl=681&kw=python&kt=3 点击 --->>>智联招聘

------------------------------------------分割线---------------------------------------------------------------

打开智联上述url 链接，发现会跳出以下画面：

能用 requests 当然用 requests 的啦！毕竟比较简洁嘛~ 但很可惜，使用 requests 库是爬取不到信息滴

我还真不信邪，把浏览器里的请求头全部复制下来，所有能带上的都带上了，结果……显然这是一次失败的尝试。

（当然，有朋友通过抓包，可以找到某个接口，通过对该接口使用 post 或 get 方法同样也可以获取到信息，这也是一种方法。）

这里，我就使用究极方法 selenium + chromedriver 来实现。

https://sou.zhaopin.com/?p=1&jl=489&kw=python&kt=3&sf=0&st=0

观察 url 可以发现：p 表示页码，kw 表示关键字，jl 等其它参数不知道是干啥滴，不过对我们爬取网站并没有影响。

接下来就可以大展拳脚咯！！

为了加快爬取速度我采用了多线程的方式进行爬取，使用 4个线程进行下载页面 3个线程进行解析页面并写入到Mongodb中。

最重要的当然是代码如何实现的啦。

代码中有详细的注释有兴趣的可以看看~

def main():
    """Ask for a page range, then download, parse and store those result pages.

    Reads the first and last page number from standard input, fills the URL
    queue, starts the producer (download) and consumer (parse/store) threads
    and blocks until both groups of threads have finished.

    Raises:
        ValueError: if either answer is not a valid integer.
    """
    # int() instead of eval(): eval would execute any expression typed at the
    # prompt, while a page number only ever needs to be an integer.
    startpage = int(input('输入起始页码:'))
    endpage = int(input('输入结束页码:'))
    # Queue of page URLs still to download.
    url_queue = Queue()
    # Queue of downloaded HTML waiting to be parsed.
    data_queue = Queue()
    spider = ZhiLianSpider(startpage, endpage, url_queue, data_queue)
    # run() returns nothing; it fills url_queue with one URL per page.
    spider.run()
    # Start the producer threads.
    spider.create_producer()
    # Start the consumer threads.
    spider.create_customer()
    # Block until the consumers, then the producers, have exited.
    spider.wait_c()
    spider.wait_p()

class ZhiLianSpider(object):
    """Crawl coordinator: fills the URL queue and owns the worker threads."""

    # Display names; one producer / consumer thread is created per entry.
    pname = ['生产者1号', '生产者2号', '生产者3号', '生产者4号']
    cname = ['消费者1号', '消费者2号', '消费者3号']

    def __init__(self, start, end, urlqueue, dataqueue):
        """Remember the page range and the two shared queues.

        :param start: first page number to crawl
        :param end: last page number to crawl (inclusive)
        :param urlqueue: queue that receives the page URLs
        :param dataqueue: queue passed on to the producer and consumer threads
        """
        self.start, self.end = start, end
        # The ``p`` query parameter is the page number filled in by run().
        self.url = r'https://sou.zhaopin.com/?p={}&jl=489&kw=python&kt=3&sf=0&st=0'
        self.urlqueue, self.dataqueue = urlqueue, dataqueue
        # Started threads are remembered so wait_p() / wait_c() can join them.
        self.p_threadlst, self.c_threadlst = [], []

    def run(self) -> None:
        """Put one complete page URL on the URL queue for every page in range."""
        page_numbers = range(self.start, self.end + 1)
        for page_url in map(self.url.format, page_numbers):
            self.urlqueue.put(page_url)

    def create_producer(self):
        """Create, record and start one producer thread per name in ``pname``.

        Kept on this class so that main() stays free of thread boilerplate.

        :return: None
        """
        for worker_name in self.pname:
            worker = Producer(
                data_queue=self.dataqueue,
                url_queue=self.urlqueue,
                name=worker_name,
            )
            self.p_threadlst.append(worker)
            worker.start()

    def wait_p(self):
        """Block until every recorded producer thread has finished."""
        for producer in self.p_threadlst:
            producer.join()

    def create_customer(self):
        """Create, record and start one consumer thread per name in ``cname``."""
        for worker_name in self.cname:
            worker = Customer(self.dataqueue, worker_name)
            self.c_threadlst.append(worker)
            worker.start()

    def wait_c(self):
        """Block until every recorded consumer thread has finished."""
        for consumer in self.c_threadlst:
            consumer.join()

生产者代码：

class Producer(threading.Thread):
    """Download worker: loads result pages in headless Chrome and queues their HTML.

    Each thread takes URLs from ``url_queue`` until it is empty and puts the
    page source of every URL on ``data_queue``.
    """

    # Headless-Chrome options, built once and shared by all producer threads.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')

    def __init__(self, data_queue, url_queue, name):
        """
        :param data_queue: queue that receives the downloaded page source
        :param url_queue: queue of page URLs still to download
        :param name: thread name shown in the progress messages
        """
        super(Producer, self).__init__()
        self.data_queue = data_queue
        self.url_queue = url_queue
        self.name = name

    def run(self) -> None:
        """Download queued URLs until the URL queue is empty.

        :return: None
        """
        # Imported locally so the block does not depend on a file-level import.
        from queue import Empty

        while True:
            # get_nowait() instead of empty() + get(): with several producers
            # another thread can take the last URL between the two calls, and
            # the blocking get() would then never return.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # First query parameter of the URL (the "p=<page>" part).
            page = url.split('?')[1].split('&')[0]
            print('我是{}---->>>>正在下载页面{}'.format(self.name, page))
            self.download_html(url)
            print('我是{}---->>>>已完成下载页面{}'.format(self.name, page))

    def download_html(self, url):
        """Load ``url`` in a fresh browser and put its page source on the data queue.

        :param url: result page to load
        """
        # A new browser is created for every URL.
        browser = webdriver.Chrome('../chromedriver.exe', options=self.options)
        try:
            browser.get(url)
            # Wait one second before looking for the pop-up.
            time.sleep(1)
            # Dismiss the pop-up (risk-warning modal) by clicking its button.
            button = browser.find_element_by_css_selector('body > div.a-modal.risk-warning > div > div > button')
            button.click()
            browser.implicitly_wait(3)
            # Give the page's JavaScript time to render the content.
            time.sleep(2)
            self.data_queue.put(browser.page_source)
        finally:
            # Always quit: this runs once per URL inside a loop, so a failure
            # above would otherwise leave a browser process behind.
            browser.quit()

消费者：

class Customer(threading.Thread):
    """Consumer worker: parses downloaded result pages and stores the postings in MongoDB."""

    # MongoDB handles are created once, when the class is defined, and are
    # shared by all consumer threads.
    conn = MongoClient(host='localhost', port=27017)
    # Database
    db = conn.zhaopin
    # Collection
    collection = db.zhaopin_collection
    # Held while one thread writes to the database, released when it is done.
    lock = threading.Lock()

    def __init__(self, data_queue, name):
        """
        :param data_queue: queue of downloaded page HTML to parse
        :param name: thread name shown in the progress messages
        """
        super(Customer, self).__init__()
        self.data_queue = data_queue
        self.name = name

    def run(self) -> None:
        """Parse queued pages until the queue has stayed empty for 20 seconds."""
        # Imported locally so the block does not depend on a file-level import.
        from queue import Empty

        while True:
            try:
                content = self.data_queue.get(True, 20)
            except Empty:
                # Only a timeout on the queue means the work is finished.
                print('我是{},已经完成解析...'.format(self.name))
                break
            print('我是{},我正在解析...'.format(self.name))
            try:
                self.parse_content(content)
            except Exception as exc:
                # A page that fails to parse or store is reported and skipped
                # instead of silently ending the thread as "finished".
                print('我是{},解析失败: {!r}'.format(self.name, exc))

    def parse_content(self, content):
        """Extract every job posting from one result page and insert them into MongoDB.

        All postings live in a single container,
        ``<div id="listContent" class="contentpile__content">``; each direct
        child element of it is treated as one posting.

        :param content: HTML source of one result page
        :return: None
        """
        soup = BeautifulSoup(content, 'lxml')
        container = soup.find('div', id='listContent')
        if container is None:
            # Nothing to parse, e.g. the page did not render the result list.
            print('我是{},未找到职位列表容器 listContent'.format(self.name))
            return

        demand_class = 'contentpile__content__wrapper__item__info__box__job__demand__item'
        info_list = []
        # Direct child tags only; text nodes between them are not postings.
        for item in container.find_all(True, recursive=False):
            try:
                # Job title
                jobname = item.find(
                    'span', class_='contentpile__content__wrapper__item__info__box__jobname__title')['title']
                # Salary
                saray = item.find('p', class_='contentpile__content__wrapper__item__info__box__job__saray').text
                demands = item.find_all('li', class_=demand_class)
                # Region
                area = demands[0].text
                # Requirements: the second and third demand items, as a tuple.
                ex = (demands[1].text.strip(), demands[2].text.strip())
                # Company name
                company_name = item.find(
                    'a',
                    class_='contentpile__content__wrapper__item__info__box__cname__title company_title').text
            except (AttributeError, IndexError, KeyError, TypeError):
                # A child that lacks one of the expected elements is skipped.
                continue

            info_list.append({
                '岗位名称': jobname,
                '工资': saray,
                '地区': area,
                '经验': ex,
                '公司名': company_name
            })

        # insert_many() rejects an empty document list, so skip empty pages.
        if not info_list:
            return
        # The context manager releases the lock even if the insert raises.
        with self.lock:
            self.collection.insert_many(info_list)

最后的最后贴上我爬取的部分信息吧~

----------------------------------------------------------------------------------------------------------------------------------------------------------

PS：优化一下代码，看看之前写的，啧啧，辣眼睛~

# author:dayin
# Date:2019/12/17 0017
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient
import threading
from queue import Queue

'''
需求： 输入起始页 和结束页 爬取智联招聘上python词条信息
       爬取的信息包括 就业岗位名称 薪资 地区  公司名称  需求{包括学历和经验}
       爬取的信息以字典形式保存到mongodb数据库中 
'''
# Lock object: held while one thread writes to the database and released once the write is done.
lock = threading.Lock()
# Synchronisation event used as the "task finished" flag.
event = threading.Event()


class ZhiLianSpider(object):
    """Crawl coordinator: fills the URL queue and starts the worker threads."""

    # Display names; one producer / consumer thread is created per entry.
    pname = ['生产者1号', '生产者2号', '生产者3号', '生产者4号']
    cname = ['消费者1号', '消费者2号', '消费者3号']

    def __init__(self, start, end, urlqueue, dataqueue):
        """Remember the page range and the two shared queues.

        :param start: first page number to crawl
        :param end: last page number to crawl (inclusive)
        :param urlqueue: queue that receives the page URLs
        :param dataqueue: queue passed on to the producer and consumer threads
        """
        self.start, self.end = start, end
        # The ``p`` query parameter is the page number filled in by run().
        self.url = r'https://sou.zhaopin.com/?p={}&jl=489&kw=python&kt=3&sf=0&st=0'
        self.urlqueue, self.dataqueue = urlqueue, dataqueue

    def run(self) -> None:
        """Put one complete page URL on the URL queue for every page in range."""
        page_numbers = range(self.start, self.end + 1)
        for page_url in map(self.url.format, page_numbers):
            self.urlqueue.put(page_url)

    def create_producer(self):
        """Create and start one producer thread per name in ``pname``.

        Kept on this class so that main() stays free of thread boilerplate.

        :return: None
        """
        for worker_name in self.pname:
            worker = Producer(
                data_queue=self.dataqueue,
                url_queue=self.urlqueue,
                name=worker_name,
            )
            worker.start()

    def create_customer(self):
        """Create and start one consumer thread per name in ``cname``."""
        for worker_name in self.cname:
            worker = Customer(self.dataqueue, worker_name)
            worker.start()


class Producer(threading.Thread):
    """Download worker: loads result pages in headless Chrome and queues their HTML.

    Each thread takes URLs from ``url_queue`` until it is empty and puts the
    page source of every URL on ``data_queue``. The module-level ``event`` is
    set when the last running producer leaves ``run()``.
    """

    # Headless-Chrome options, built once and shared by all producer threads.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')

    # Number of producers currently inside run(); guarded by _active_lock.
    _active = 0
    _active_lock = threading.Lock()

    def __init__(self, data_queue, url_queue, name):
        """
        :param data_queue: queue that receives the downloaded page source
        :param url_queue: queue of page URLs still to download
        :param name: thread name shown in the progress messages
        """
        super(Producer, self).__init__()
        self.data_queue = data_queue
        self.url_queue = url_queue
        self.name = name

    def run(self) -> None:
        """Download queued URLs until the URL queue is empty, then signal completion.

        :return: None
        """
        # Imported locally so the block does not depend on a file-level import.
        from queue import Empty

        with Producer._active_lock:
            Producer._active += 1
        try:
            while True:
                # get_nowait() instead of empty() + get(): another producer
                # can take the last URL between the two calls, and the
                # blocking get() would then never return.
                try:
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                # First query parameter of the URL (the "p=<page>" part).
                page = url.split('?')[1].split('&')[0]
                print('我是{}---->>>>正在下载页面{}'.format(self.name, page))
                self.download_html(url)
                print('我是{}---->>>>已完成下载页面{}'.format(self.name, page))
        finally:
            # Only the last producer to leave sets the event. Setting it as
            # soon as any one producer found the URL queue empty would signal
            # "finished" while other producers are still downloading pages.
            with Producer._active_lock:
                Producer._active -= 1
                if Producer._active == 0:
                    event.set()

    def download_html(self, url):
        """Load ``url`` in a fresh browser and put its page source on the data queue.

        :param url: result page to load
        """
        # A new browser is created for every URL.
        browser = webdriver.Chrome('chromedriver.exe', options=self.options)
        try:
            browser.get(url)
            # Wait one second before looking for the pop-up.
            time.sleep(1)
            # Dismiss the pop-up (risk-warning modal) by clicking its button.
            button = browser.find_element_by_css_selector('body > div.a-modal.risk-warning > div > div > button')
            button.click()
            browser.implicitly_wait(3)
            # Give the page's JavaScript time to render the content.
            time.sleep(2)
            self.data_queue.put(browser.page_source)
        finally:
            # Always quit: this runs once per URL inside a loop, so a failure
            # above would otherwise leave a browser process behind.
            browser.quit()


class Customer(threading.Thread):
    """Consumer worker: parses downloaded result pages and stores the postings in MongoDB."""

    # MongoDB handles are created once, when the class is defined, and are
    # shared by all consumer threads.
    conn = MongoClient(host='192.168.43.115', port=27017)
    # Database
    db = conn.zhaopin
    # Collection
    collection = db.zhaopin_collection

    def __init__(self, data_queue: Queue, name):
        """
        :param data_queue: queue of downloaded page HTML to parse
        :param name: thread name shown in the progress messages
        """
        super(Customer, self).__init__()
        self.data_queue = data_queue
        self.name = name

    def run(self) -> None:
        """Parse queued pages until the queue is empty and ``event`` is set."""
        # Imported locally so the block does not depend on a file-level import.
        from queue import Empty

        while True:
            # A short timeout instead of an unbounded get(): a consumer that
            # is already waiting when the last page is taken by another
            # consumer would otherwise block forever and never see the event.
            try:
                content = self.data_queue.get(timeout=1)
            except Empty:
                if event.is_set():
                    print('任务完成...')
                    break
                continue
            print('我是{},我正在解析...'.format(self.name))
            self.parse_content(content)
            print('我是{},已经完成解析...'.format(self.name))

    def parse_content(self, content):
        """Extract every job posting from one result page and insert them into MongoDB.

        All postings live in a single container,
        ``<div id="listContent" class="contentpile__content">``; each direct
        child element of it is treated as one posting.

        :param content: HTML source of one result page
        :return: None
        """
        soup = BeautifulSoup(content, 'lxml')
        container = soup.find('div', id='listContent')
        if container is None:
            # Nothing to parse, e.g. the page did not render the result list.
            print('我是{},未找到职位列表容器 listContent'.format(self.name))
            return

        demand_class = 'contentpile__content__wrapper__item__info__box__job__demand__item'
        info_list = []
        # Direct child tags only; text nodes between them are not postings.
        for item in container.find_all(True, recursive=False):
            try:
                # Job title
                jobname = item.find(
                    'span', class_='contentpile__content__wrapper__item__info__box__jobname__title')['title']
                # Salary
                saray = item.find('p', class_='contentpile__content__wrapper__item__info__box__job__saray').text
                demands = item.find_all('li', class_=demand_class)
                # Region
                area = demands[0].text
                # Requirements: the second and third demand items, as a tuple.
                ex = (demands[1].text.strip(), demands[2].text.strip())
                # Company name
                company_name = item.find(
                    'a',
                    class_='contentpile__content__wrapper__item__info__box__cname__title company_title').text
            except (AttributeError, IndexError, KeyError, TypeError):
                # A child that lacks one of the expected elements is skipped.
                continue

            info_list.append({
                '岗位名称': jobname,
                '工资': saray,
                '地区': area,
                '经验': ex,
                '公司名': company_name
            })

        # insert_many() rejects an empty document list, so skip empty pages.
        if not info_list:
            return
        # The context manager releases the lock even if the insert raises.
        with lock:
            self.collection.insert_many(info_list)


def main():
    """Ask for a page range, then start downloading, parsing and storing those pages.

    Reads the first and last page number from standard input, fills the URL
    queue and starts the producer (download) and consumer (parse/store)
    threads. The threads are not joined here.

    Raises:
        ValueError: if either answer is not a valid integer.
    """
    # int() instead of eval(): eval would execute any expression typed at the
    # prompt, while a page number only ever needs to be an integer.
    startpage = int(input('输入起始页码:'))
    endpage = int(input('输入结束页码:'))
    # Queue of page URLs still to download.
    url_queue = Queue()
    # Queue of downloaded HTML waiting to be parsed.
    data_queue = Queue()
    spider = ZhiLianSpider(startpage, endpage, url_queue, data_queue)
    # run() returns nothing; it fills url_queue with one URL per page.
    spider.run()
    # Start the producer threads.
    spider.create_producer()
    # Start the consumer threads.
    spider.create_customer()

# Start the crawler only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

版权声明：本文来源CSDN，感谢博主原创文章，遵循 CC 4.0 by-sa 版权协议，转载请附上原文出处链接和本声明。
原文链接：https://blog.csdn.net/weixin_42218582/article/details/90702969
站方申明：本站部分内容来自社区用户分享，若涉及侵权，请联系站方删除。

发表于 2020-04-18 21:28:35
阅读 ( 1948 )
分类：

python爬虫开发之“智联招聘”网页爬取

这里，我就使用究极方法 selenium + chromedriver 来实现。

你可能感兴趣的文章

精选的优质文章

0 条评论

官方社群

GO教程

推荐文章

猜你喜欢

随便看看