python爬取京东商品列表信息 - Go语言中文社区

python爬取京东商品列表信息


爬遍电商之京东篇:

目标是爬取指定商品的商品列表信息,包括商品名,价格,评论数,店铺名

打开京东页面,随便搜一个笔记本,F12打开Network面板开始抓包,翻个3页,遇到断点就按F8执行,然后看到第一个返回内容的ajax请求,是返回了第1页的后30个商品,下面开头名一样的依次返回第2页前30个,第2页后30个,第3页前30个,第三页后30个…别问是怎么知道的,对比一下就行了
在这里插入图片描述

看看第一个ajax是有哪些请求参数,通过跟下面几个对比发现,请求前30个商品和请求后30个商品,请求的参数有一点点不同,并且page、s、log_id都是会变化的

假设当前爬的是第N页
对于请求后30个商品的
page 都是偶数依次是2、4、6、8…
s=(N*2-1)*30+1
log_id就是15位的时间戳/100000

对于请求前30个商品的
page 都是奇数依次是1、3、5、7…
s=(N-1)*30+1
都是比较简单的找规律,齐活,开始撸代码
在这里插入图片描述
在这里插入图片描述

首先是获取前30个商品的代码,传入参数(查询的字段,当前爬取第几页),构造请求头和请求参数,获取后30个商品的代码类似

def get_first(shuru,i):
	"""Fetch the first 30 products of result page *i* for keyword *shuru*.

	JD loads each search result page in two AJAX halves; the first half
	uses an odd ``page`` value (1, 3, 5, ...) and ``s = (i - 1) * 30 + 1``.
	Prints the title text list of every matched product.
	"""
	# Odd page index for the "first 30 items" request of logical page i.
	page=2*i-1
	# Offset of the first item on logical page i (1-based).
	s = (i  - 1) * 30 + 1

	# Request headers mimicking the browser's XHR; the cookie was captured
	# from a real session and may expire — refresh it if requests start failing.
	he = {
		'accept': '*/*',
		'accept-encoding': 'gzip, deflate, br',
		'accept-language': 'zh-CN,zh;q=0.9',
		'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
		'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
		'sec-fetch-mode': 'cors',
		'sec-fetch-site': 'same-origin',
		'x-requested-with': 'XMLHttpRequest',
		# Random user-agent drawn from the module-level fake_useragent pool.
		'user-agent': ka.random

	}

	# Query-string parameters of the s_new.php AJAX endpoint.
	data = {
		'keyword': shuru,
		'enc': 'utf-8',
		'qrst': 1,
		'rt': 1,
		'stop': 1,
		'vt': 2,
		'wq': shuru,
		'page': page,
		's': s,
		'click':0
	}
	url = 'https://search.jd.com/s_new.php?'
	res = requests.get(url + urlencode(data), headers=he,timeout=5)
	res.encoding='utf-8'
	# Parse the HTML fragment and print each product title.
	source = etree.HTML(res.text)
	title_list=source.xpath('//li[@class="gl-item"]')
	for title in title_list:
		tt=title.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
		print(tt)

完整代码如下,2个方法就行,具体想获取的信息自行补充奥

# -*- coding: utf-8 -*-
import os
import re
import time
from urllib.parse import urlencode
import fake_useragent
import requests
from lxml import etree

# Method 2: load the User-Agent pool from a local file.
# BUG FIX: the original `os.getcwd() + 'headers.csv'` concatenated without a
# path separator (e.g. '/home/userheaders.csv'); os.path.join is correct on
# every platform.
location = os.path.join(os.getcwd(), 'headers.csv')
ka = fake_useragent.UserAgent(path=location, verify_ssl=False, use_cache_server=False)


def get_first(shuru, i):
	"""Fetch the first 30 products of result page *i* for keyword *shuru*.

	JD loads each search result page in two AJAX halves; the first half
	uses an odd ``page`` value (1, 3, 5, ...) and ``s = (i - 1) * 30 + 1``.
	Prints each product's title text list (preserving the original output)
	and additionally returns the collected lists so callers can use them.
	"""
	page = 2 * i - 1        # odd page index for the "first 30 items" request
	s = (i - 1) * 30 + 1    # 1-based offset of the first item on logical page i

	# Headers mimicking the browser's XHR; the cookie was captured from a
	# real session and may expire — refresh it if requests start failing.
	he = {
		'accept': '*/*',
		'accept-encoding': 'gzip, deflate, br',
		'accept-language': 'zh-CN,zh;q=0.9',
		'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
		'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
		'sec-fetch-mode': 'cors',
		'sec-fetch-site': 'same-origin',
		'x-requested-with': 'XMLHttpRequest',
		# Random user-agent drawn from the module-level fake_useragent pool.
		'user-agent': ka.random,
	}

	# Query-string parameters of the s_new.php AJAX endpoint.
	data = {
		'keyword': shuru,
		'enc': 'utf-8',
		'qrst': 1,
		'rt': 1,
		'stop': 1,
		'vt': 2,
		'wq': shuru,
		'page': page,
		's': s,
		'click': 0,
	}
	url = 'https://search.jd.com/s_new.php?'
	res = requests.get(url + urlencode(data), headers=he, timeout=5)
	res.encoding = 'utf-8'

	# Parse the HTML fragment; each <li class="gl-item"> is one product.
	source = etree.HTML(res.text)
	titles = []
	for item in source.xpath('//li[@class="gl-item"]'):
		tt = item.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
		print(tt)
		titles.append(tt)
	return titles




def get_laterpage(shuru, i):
	"""Fetch the last 30 products of result page *i* for keyword *shuru*.

	The second AJAX half uses an even ``page`` value (2, 4, 6, ...),
	``s = (i * 2 - 1) * 30 + 1``, and extra ``scrolling``/``log_id``/``tpl``
	parameters. Prints each product's title text list (preserving the
	original output) and additionally returns the collected lists.
	"""
	s = (i * 2 - 1) * 30 + 1    # offset of item 31 on logical page i
	page = i * 2                # even page index for the "last 30 items" request

	# Headers mimicking the browser's XHR; the cookie was captured from a
	# real session and may expire — refresh it if requests start failing.
	he = {
		'accept': '*/*',
		'accept-encoding': 'gzip, deflate, br',
		'accept-language': 'zh-CN,zh;q=0.9',
		'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
		'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
		'sec-fetch-mode': 'cors',
		'sec-fetch-site': 'same-origin',
		'x-requested-with': 'XMLHttpRequest',
		# Random user-agent drawn from the module-level fake_useragent pool.
		'user-agent': ka.random,
	}

	# Query-string parameters; log_id is the current time to 5 decimal places,
	# matching the value the browser sends for the "scrolling" request.
	data = {
		'keyword': shuru,
		'enc': 'utf-8',
		'qrst': 1,
		'rt': 1,
		'stop': 1,
		'vt': 2,
		'wq': shuru,
		'page': page,
		's': s,
		'scrolling': 'y',
		'log_id': int(time.time() * 100000) / 100000,
		'tpl': '1_M',
	}
	url = 'https://search.jd.com/s_new.php?'
	res = requests.get(url + urlencode(data), headers=he, timeout=5)
	res.encoding = 'utf-8'

	# Parse the HTML fragment; each <li class="gl-item"> is one product.
	source = etree.HTML(res.text)
	titles = []
	for item in source.xpath('//li[@class="gl-item"]'):
		tt = item.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
		print(tt)
		titles.append(tt)
	return titles







if __name__ == '__main__':
	# Crawl pages 1..4 for the keyword; each logical page needs two AJAX
	# requests (first 30 items, then last 30). The 2-second sleeps space the
	# requests out to avoid being rate-limited.
	# (Removed the unused module-level `s = 1`; both request functions
	# compute their own `s` offset.)
	shuru = '笔记本'
	for i in range(1, 5):
		get_first(shuru, i)
		print('---------------------------')
		time.sleep(2)
		get_laterpage(shuru, i)
		print('第%s页结束' % i)
		time.sleep(2)

我只获取了商品标题,最后是这样的
在这里插入图片描述

版权声明:本文来源CSDN,感谢博主原创文章,遵循 CC 4.0 by-sa 版权协议,转载请附上原文出处链接和本声明。
原文链接:https://blog.csdn.net/weixin_42195144/article/details/102813317
站方申明:本站部分内容来自社区用户分享,若涉及侵权,请联系站方删除。

0 条评论

请先 登录 后评论

官方社群

GO教程

猜你喜欢