
Python Web Scraper for Sina News (Concise Version)


  • Sina News
  • web scraping
  • Python

The comments are fairly detailed, so the full code follows; feedback and corrections are welcome. In outline: start_get drives headless Chrome through the paginated news list and collects each page's rendered source, parse_page pulls article titles and links out of each page, and write_in fetches each article and appends one row to 新浪.csv.
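Running this requires Chrome plus a matching chromedriver on the PATH. If chromedriver lives somewhere else, a minimal Selenium 4 sketch (the path below is illustrative, not from the original article) that points the driver at it explicitly:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

opts = Options()
opts.add_argument('--headless')
opts.add_argument('--disable-gpu')
# Illustrative path; substitute wherever chromedriver is actually installed
service = Service('/usr/local/bin/chromedriver')
browser = webdriver.Chrome(service=service, options=opts)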

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import csv

# Create a headless Chrome browser instance
chrome_options = Options()
# Run without a visible browser window
chrome_options.add_argument('--headless')
# Needed when running on Windows
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# Wait up to 10 seconds for elements to appear
browser.implicitly_wait(10)

# Use headless Chrome to render dynamically loaded JS
def start_get(url):
    try:
        browser.get(url)

        a = []
        for one in range(1, 100):
            sleep(0.5)
            # Scroll to the bottom of the page
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(0.5)
            # Scroll to the bottom again to trigger lazy loading
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(1)
            # Grab the fully rendered page source
            source = browser.page_source
            a.append(source)
            print(f"Fetched source for page {one}")
            # find_elements_by_xpath was removed in Selenium 4; use find_elements(By.XPATH, ...)
            next_btn = browser.find_elements(By.XPATH, './/span[@class="pagebox_next"]')
            if next_btn:
                next_btn[0].click()
            else:
                # No "next page" button: we are on the last page
                break
        return a
    except Exception as e:
        print(e)
        return []
        
# Parse a news list page
def parse_page(html):
    # Build an etree object from the raw HTML
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@id="subShowContent1_static"]')  # NOTE: adjust this XPath to match the current page layout
    for one_new in new_lst:
        title = one_new.xpath('.//h2/a/text()')[0]
        link = one_new.xpath('.//h2/a/@href')[0]
        write_in(title, link)

# Fetch one article and append it to the CSV file
def write_in(title, link):
    alist = []
    print('Writing article: {}'.format(title))
    browser.get(link)
    sleep(1)
    source = browser.page_source
    tree = etree.HTML(source)
    alist.append(title)

    con_link = link
    alist.append(con_link)

    # Article body: concatenate the text of every <p> in the article div
    content_lst = tree.xpath('.//div[@class="article"]//p')
    con = ''
    for one_content in content_lst:
        if one_content.text:
            con = con + '\n' + one_content.text.strip()
    alist.append(con)

    # Publication time
    post_time = tree.xpath('.//span[@class="date"]')[0].text
    alist.append(post_time)

    # News source
    post_source = tree.xpath('.//a[@class="source"]')[0].text
    alist.append(post_source)

    # Comment count
    tiecount = tree.xpath('.//a[@data-sudaclick="comment_sum_p"]')[0].text
    alist.append(tiecount)

    # Comment participant count
    tiejoincount = tree.xpath('.//a[@data-sudaclick="comment_participatesum_p"]')[0].text
    alist.append(tiejoincount)

    # Append the row; the with-block closes the file even if writing fails
    with open('新浪.csv', 'a+', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(alist)
if __name__ == '__main__':
    urls = ['https://news.163.com/domestic/', 'https://news.163.com/world/', 'https://war.163.com/', 'https://money.163.com/', 'https://tech.163.com/']
    for url in urls:
        html_list = start_get(url)
        for h in html_list:
            try:
                parse_page(h)
            except Exception as e:
                print(e)
    # Release the browser once all URLs are processed
    browser.quit()
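One thing the script never does is write a header row, so 新浪.csv is all data. If you want one, here is a minimal sketch that you would run once before scraping; the column names are my own labels, not from the original article, and the order mirrors alist in write_in:

import csv

# Hypothetical column labels; the order mirrors alist in write_in
HEADER = ['title', 'link', 'content', 'post_time', 'post_source',
          'comment_count', 'participant_count']

# 'w' truncates the file, so run this once before the scraper
with open('新浪.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(HEADER)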

The result looks like this:
[screenshot of the scraped 新浪.csv output]
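To spot-check the output without opening a spreadsheet, a quick sketch that reads the CSV back, assuming the column order produced by write_in:

import csv

# Print the timestamp and title of every scraped article
with open('新浪.csv', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        # row layout assumed: title, link, content, post_time,
        # post_source, comment_count, participant_count
        print(row[3], row[0])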
Note: this article is for technical exchange only and must not be used for commercial purposes; the author accepts no responsibility for misuse.

Copyright notice: this article originally appeared on CSDN; thanks to the original author. It is licensed under CC 4.0 BY-SA; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/qq_38112817/article/details/104627874