社区微信群开通啦,扫一扫抢先加入社区官方微信群
社区微信群
# Script 1, step 1: open the ptpress shop, search for "python", and capture
# the HTML of the search-results page.
import time

from selenium import webdriver  # browser automation
from selenium.webdriver.common.by import By  # locator strategies for find_element

url = 'https://www.ptpress.com.cn/shopping/index'

driver = webdriver.Chrome()  # start a Chrome session
driver.get(url)  # load the shop landing page
time.sleep(10)  # fixed pause so the page can finish rendering

# find_element(By.…) is used instead of the find_element_by_* helpers, which
# no longer exist in Selenium 4; this form works on Selenium 3 as well.
input_button = driver.find_element(By.CSS_SELECTOR, 'div.allSearch > input')  # search text box
input_button.send_keys('python')  # type the search keyword
time.sleep(4)

input_click = driver.find_element(By.CSS_SELECTOR, 'div.allSearch > a > i')  # search icon next to the box
input_click.click()  # submit the search
time.sleep(20)  # fixed pause for the results to load

# Make the last handle in the list the active window (presumably the tab the
# search opened) before reading the page source.
windows = driver.window_handles
driver.switch_to.window(windows[-1])

html = driver.page_source  # markup of the results page, parsed by the next step
print(html)
# Script 1, step 2: parse the results page, open the first book's detail page,
# and print its price, author, stock and description fields.
from urllib.parse import urljoin

from lxml import etree

try:
    e_html = etree.HTML(html)
    c_url = e_html.xpath('//li[@class="item"]/a/@href')  # links to the book detail pages
    if not c_url:
        # Fail with a clear message instead of a bare IndexError on c_url[0].
        raise RuntimeError('no book links found on the search results page')

    # urljoin yields the same URL as plain concatenation for "/path" hrefs and
    # also copes with absolute or slash-less hrefs.
    c_url_one = urljoin('https://www.ptpress.com.cn', c_url[0])
    print(c_url_one)

    driver.get(c_url_one)
    time.sleep(10)  # fixed pause so the detail page can finish rendering
    c_html = driver.page_source
    print(c_html)

    child_e_html = etree.HTML(c_html)
    price = child_e_html.xpath('//span[@class="price-p"]/span/text()')
    # Guard the index: the XPath returns an empty list when nothing matches.
    print(price[0] if price else 'price not found')
    author = child_e_html.xpath('//p[contains(@class,"book-author")]/text()')
    print(author)
    kucun = child_e_html.xpath('//div[@class="key_attr"]/dl[1]/dd/text()')  # "kucun" = stock
    print(kucun)
    info = child_e_html.xpath('//div[@class="mobile-con"]/p/text()')
    print(info)
finally:
    # End the browser session whether or not scraping succeeded, so the
    # Chrome/chromedriver processes are not left running.
    driver.quit()
以爬取豆瓣电影《流浪地球》的热门短评为例:
import csv
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import excepted_condition as EC
# Script 2, step 1: start Chrome, open the Douban home page and enter the
# first iframe on it; the login widgets in the next step are looked up there.
url = 'https://www.douban.com/'
# NOTE(review): `ua` is not passed to the driver anywhere in the visible code.
# The value is the bare user-agent string, without a "User-Agent:" prefix
# inside the value itself.
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
driver = webdriver.Chrome()  # start a Chrome session
driver.get(url)  # load the page
print(driver)  # show the driver object
driver.maximize_window()
# switch_to.frame / find_element(By.…) replace switch_to_frame and
# find_element_by_tag_name, which no longer exist in Selenium 4.
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
html = driver.page_source
print(html)
# In[61]:
# Script 2, step 2: log in with account + password.
# All lookups use find_element(By.XPATH, …); the find_element_by_xpath helper
# no longer exists in Selenium 4, while this form works on Selenium 3 and 4.

# Switch the form to password login.
login_with_password = driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/ul[1]/li[2]')
login_with_password.click()
# Enter the account name (placeholder — replace with a real account).
username = driver.find_element(By.XPATH, '//*[@id="username"]')
username.send_keys('your account')
# Enter the password (placeholder — replace with a real password).
password = driver.find_element(By.XPATH, '//*[@id="password"]')
password.send_keys('your password')
time.sleep(2)
# Click the login button.
submit = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[1]/div[5]/a')
submit.click()
# Make the last window handle in the list the active window.
windows = driver.window_handles
driver.switch_to.window(windows[-1])
time.sleep(10)  # fixed pause for the post-login page to load
# In[62]:
# Script 2, step 3: search for the film and open the first search result.
# Lookups use find_element(By.XPATH, …), which works on Selenium 3 and 4.
search_input = driver.find_element(By.XPATH, '//*[@id="inp-query"]')
search_input.send_keys('流浪地球')
time.sleep(2)
search_button = driver.find_element(
    By.XPATH, '//*[@id="db-nav-sns"]/div/div/div[2]/form/fieldset/div[2]/input'
)
search_button.click()
# Open the detail page of the first result. Poll for up to 10 s rather than
# assuming the results page has rendered by the time the click returns;
# WebDriverWait retries while the lookup raises NoSuchElementException.
item = WebDriverWait(driver, 10).until(
    lambda d: d.find_element(By.XPATH, '//div[@class="result"]//div[@class="title"]/h3/a')
)
item.click()
# In[65]:
# Script 2, step 4: activate the last window handle in the list (presumably
# the tab opened by the preceding click) and capture its markup.
windows = driver.window_handles
latest_handle = windows[-1]
driver.switch_to.window(latest_handle)
# Keep the active page's source in `html` for the parsing step, and echo it.
html = driver.page_source
print(html)
# In[66]:
# Script 2, step 5: parse the captured page with lxml/XPath, print the film's
# metadata, then open the hot short-comments link.
e_html = etree.HTML(html)
film_name = e_html.xpath('//*[@id="content"]/h1/span[1]/text()')  # film title
print(film_name)
year = e_html.xpath('//*[@id="content"]/h1/span[2]/text()')  # release year
print(year)
director = e_html.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')  # director
print(director)
types = e_html.xpath('//*[@id="info"]/span[5]/text()')  # genre
print(types)
score = e_html.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')  # Douban rating
print(score)
# find_element(By.XPATH, …) replaces find_element_by_xpath, which no longer
# exists in Selenium 4; this form works on Selenium 3 and 4.
hot_comments = driver.find_element(By.XPATH, '//*[@id="hot-comments"]/a')  # hot short comments link
hot_comments.click()
# In[67]:
# Script 2, step 6: parse the short comments from the current page and append
# them to a text file named after the film, then close the browser.
try:
    html = driver.page_source  # markup of the page reached by the previous click
    e_html = etree.HTML(html)
    commits = e_html.xpath('//*[@id="comments"]/div/div[2]/p/span/text()')  # short-comment texts

    # Fall back to a fixed name when the title XPath matched nothing, instead
    # of failing with IndexError on film_name[0].
    title = film_name[0].strip() if film_name else 'douban_comments'
    # Raw Windows path with its backslash separators; this is the original
    # author's desktop folder — adjust it to a directory that exists locally.
    out_path = r'C:\Users\HP\Desktop\{}.txt'.format(title)

    with open(out_path, 'a', encoding='utf-8') as df:
        for one in commits:
            # Each comment is written as a JSON string literal followed by a
            # blank line; ensure_ascii=False keeps Chinese text readable.
            df.write(json.dumps(one, ensure_ascii=False) + '\n\n')
finally:
    # End the browser session so Chrome/chromedriver are not left running.
    driver.quit()
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!