社区微信群开通啦,扫一扫抢先加入社区官方微信群
社区微信群
这里只解析一下代码,所需工具jsoup、HttpClient
先用 HttpClient 获取 HTML,再用 jsoup 解析 HTML,最后用 Java 代码提取所需要的信息。
之前写的有点问题,今天改了一下。因为通过hash值拼接的地址是个临时地址,存在数据库后一天就失效了,所以我改了一下。先把爬到的歌曲下载到本地,然后上传到七牛云的对象存储空间,再返回这个地址。最后把七牛云上的地址存到数据库中,这样就是永久的了。
HtmlManage:
package com.after.demo.spider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 19:06
*/
public class HtmlManage {

    /**
     * Turns raw HTML text into a jsoup document tree.
     *
     * @param html the markup to parse
     * @return whatever {@code Jsoup.parse(html)} produces for the given markup
     */
    public Document manage(String html) {
        return Jsoup.parse(html);
    }
}
HttpGetConnect:
package com.after.demo.spider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 19:05
*/
public class HttpGetConnect {

    private static Log log = LogFactory.getLog(HttpGetConnect.class);

    /**
     * Performs an HTTP GET and returns the response body as text.
     *
     * <p>Failures are handled on a best-effort basis: any exception raised while
     * requesting or reading is logged and an empty string is returned, and a
     * non-2xx status also yields an empty string. Line terminators in the body
     * are normalised to {@code "\n"}, one after every line read.
     *
     * @param url         address to fetch
     * @param charsetName charset used to decode the body, e.g. UTF-8 or GB2312
     * @return the decoded body, or {@code ""} when the request failed or was not 2xx
     * @throws IOException if closing the HTTP client fails
     */
    public static String connect(String url, String charsetName) throws IOException {
        BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();
        CloseableHttpClient httpclient = HttpClients.custom()
                .setConnectionManager(connManager)
                .build();
        String content = "";
        try {
            HttpGet httpget = new HttpGet(url);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            httpget.setConfig(requestConfig);
            // Browser-like headers so the site serves the same markup it serves to a browser.
            httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("Upgrade-Insecure-Requests", "1");
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            httpget.setHeader("cache-control", "max-age=0");

            // The response is closed even when reading the body fails.
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                int status = response.getStatusLine().getStatusCode();
                HttpEntity entity = response.getEntity();
                if (status >= 200 && status < 300 && entity != null) {
                    content = readAll(entity.getContent(), charsetName);
                }
            }
        } catch (Exception e) {
            // Best effort: callers treat "" as "nothing fetched".
            log.error("GET " + url + " failed", e);
        } finally {
            httpclient.close();
        }
        return content;
    }

    /** Reads the stream line by line in the given charset, ending every line with a newline. */
    private static String readAll(InputStream instream, String charsetName) throws IOException {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(instream, charsetName))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
            return sb.toString();
        }
    }
}
MusicController:
package com.after.demo.controller;
import com.after.demo.entity.Music;
import com.after.demo.service.impl.MusicServiceImpl;
import com.after.demo.service.impl.UploadServiceImpl;
import com.after.demo.spider.FileDownload;
import com.after.demo.spider.HtmlManage;
import com.after.demo.spider.HttpGetConnect;
import com.after.demo.utils.GetString;
import com.after.demo.utils.JsonResult;
import com.google.gson.Gson;
import com.qiniu.common.QiniuException;
import com.qiniu.http.Response;
import com.qiniu.storage.model.DefaultPutRet;
import io.swagger.annotations.ApiOperation;
import net.sf.json.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 21:18
*/
@RestController
public class MusicController {
@Autowired
MusicServiceImpl musicService;
@Autowired
UploadServiceImpl uploadService;
public static String FILEPATH = "F:/music/";
public static String mp3 = "https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191027067069941080546_1546235744250&"
+ "hash=HASH&album_id=0&_=TIME";
public static final String LINK = "https://www.kugou.com/yy/rank/home/PAGE-33164.html?from=rank";
@GetMapping("/music/save")
@ApiOperation("将酷狗歌单爬取存入数据库")
public JsonResult saveMusic() throws IOException{
for(int i = 1 ; i < 10 ; i++){
String url = LINK.replace("PAGE", i + "");
getTitle(url);
}
return JsonResult.ok();
}
@PostMapping("/music/getOne")
@ApiOperation("随机获取一首歌")
public JsonResult getMusic(){
int id = GetString.getId();
Music music = musicService.getMusicById(id);
return JsonResult.ok(music);
}
public String getTitle(String url) throws IOException {
String content = HttpGetConnect.connect(url, "utf-8");
HtmlManage html = new HtmlManage();
Document doc = html.manage(content);
Element ele = doc.getElementsByClass("pc_temp_songlist").get(0);
Elements eles = ele.getElementsByTag("li");
for(int i = 0 ; i < eles.size() ; i++){
Element item = eles.get(i);
String title = item.attr("title").trim();
String link = item.getElementsByTag("a").first().attr("href");
download(link,title);
}
return null;
}
public String download(String url,String name) throws IOException{
String hash = "";
String content = HttpGetConnect.connect(url, "utf-8");
HtmlManage html = new HtmlManage();
String regEx = ""hash":"[0-9A-Z]+"";
// 编译正则表达式
Pattern pattern = Pattern.compile(regEx);
Matcher matcher = pattern.matcher(content);
if (matcher.find()) {
hash = matcher.group();
hash = hash.replace(""hash":"", "");
hash = hash.replace(""", "");
}
//爬取歌曲的封面图
Document doc = html.manage(content);
Element ele = doc.getElementsByClass("albumImg").get(0);
String imgUrl = ele.getElementsByTag("img").attr("src");
//利用hash值构造歌曲mp3地址
String item = mp3.replace("HASH", hash);
item = item.replace("TIME", System.currentTimeMillis() + "");
String mp = HttpGetConnect.connect(item, "utf-8");
mp = mp.substring(mp.indexOf("(") + 1, mp.length() - 3);
JSONObject json = JSONObject.fromObject(mp);
String playUrl = json.getJSONObject("data").getString("play_url");
System.out.println(playUrl);
FileDownload fileDownload = new FileDownload();
fileDownload.download(playUrl,FILEPATH + name + ".mp3");
String src = null;
try{
File file = new File(FILEPATH + name + ".mp3");
Response response = uploadService.uploadFile(file);
//解析上传成功的结果
DefaultPutRet putRet = new Gson().fromJson(response.bodyString(), DefaultPutRet.class);
src = "http://www.jie12366.xyz/" + putRet.key;
}catch (QiniuException e){
e.printStackTrace();
}
//如果图片地址或mp3地址为空,则不爬取(歌曲是收费的无法爬取)
if (StringUtils.isNotBlank(src) && StringUtils.isNotBlank(imgUrl)){
musicService.saveMusic(name,imgUrl,src);
}
return playUrl;
}
}
新增FileDownload类:
package com.after.demo.spider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/24 16:37
*/
public class FileDownload {

    /**
     * Downloads {@code url} into the file {@code path}.
     *
     * <p>The request is attempted up to three times; an attempt counts as failed
     * when the status is anything other than 200. Errors are reported on stderr
     * and otherwise swallowed, so callers must check the target file themselves.
     * A file that could not be written completely is removed again.
     *
     * @param url  address to download
     * @param path target path including the file name
     */
    public void download(String url, String path) {
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(2000)
                .setConnectTimeout(2000).build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int attempt = 0; attempt < 3; attempt++) {
                // Closing each response hands its connection back before the next attempt.
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    if (result.getStatusLine().getStatusCode() != 200) {
                        continue;
                    }
                    writeBody(result, path);
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
    }

    /** Copies the response body into the file at {@code path}, deleting the file if the copy fails. */
    private void writeBody(CloseableHttpResponse result, String path) throws java.io.IOException {
        File file = new File(path);
        boolean complete = false;
        try (BufferedInputStream in = new BufferedInputStream(result.getEntity().getContent());
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer, 0, buffer.length)) > -1) {
                out.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            // Runs after both streams are closed; a truncated file must not be left for the caller.
            if (!complete) {
                file.delete();
            }
        }
    }
}
entity:
package com.after.demo.entity;
import com.gitee.sunchenbin.mybatis.actable.annotation.Column;
import com.gitee.sunchenbin.mybatis.actable.annotation.Table;
import com.gitee.sunchenbin.mybatis.actable.constants.MySqlTypeConstant;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 19:20
*/
@Data
@AllArgsConstructor
@NoArgsConstructor
@Table(name = "music")
public class Music {
// One stored song. Accessors and both constructors come from the Lombok annotations;
// the @Table/@Column annotations describe the "music" table and its columns.

// Auto-incremented primary key.
@Column(name = "id",type = MySqlTypeConstant.INT,isKey = true,isAutoIncrement = true,length = 5)
private int id;
// Song title; the column is declared unique.
@Column(name = "name",type = MySqlTypeConstant.VARCHAR,isUnique = true)
private String name;
// Cover image URL.
// NOTE(review): length = 80 looks tight for a URL — confirm longer values are not rejected or truncated.
@Column(name = "imgUrl",type = MySqlTypeConstant.VARCHAR,length = 80)
private String imgUrl;
// Address of the audio file (MusicController stores the URL of the uploaded copy here).
@Column(name = "src",type = MySqlTypeConstant.VARCHAR)
private String src;
}
mapper:
package com.after.demo.mapper;
import com.after.demo.entity.Music;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;
import java.util.List;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 21:03
*/
@Mapper
public interface MusicMapper {
/**
* Inserts one crawled song into the music table.
* NOTE(review): the three parameters carry no explicit name annotation, so the
* #{name}/#{imgUrl}/#{src} placeholders presumably rely on parameter names being
* retained by the compiler — confirm the build enables that.
* @param name song title
* @param imgUrl cover image URL
* @param src address of the audio file
* @return the int result of the insert (the original comment describes it as "whether it succeeded")
*/
@Insert("insert into music(name,imgUrl,src) values(#{name},#{imgUrl},#{src})")
int saveMusic(String name,String imgUrl,String src);
/**
* Reads every row of the music table.
* @return all stored songs
*/
@Select("select * from music")
List<Music> listMusic();
/**
* Looks up a single song by its id; the caller in this project passes a randomly drawn id.
* @param id primary key to look up
* @return the song selected by the query
*/
@Select("select * from music where id=#{id}")
Music getMusicById(int id);
}
service:
package com.after.demo.service.impl;
import com.after.demo.entity.Music;
import com.after.demo.mapper.MusicMapper;
import com.after.demo.service.MusicService;
import com.after.demo.utils.GetString;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.CacheConfig;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;
import java.util.List;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 21:09
*/
@Service
@CacheConfig
public class MusicServiceImpl implements MusicService {
    @Autowired
    MusicMapper musicMapper;

    /**
     * Stores a song unless one with the same name already exists.
     *
     * @param name   song title
     * @param imgUrl cover image URL
     * @param src    address of the audio file
     * @return {@code 0} when a song with this name is already stored, otherwise
     *         the result of the mapper's insert
     */
    @Override
    public int saveMusic(String name, String imgUrl, String src) {
        // The mapper offers no lookup by name, so the existing rows are scanned.
        for (Music music : musicMapper.listMusic()) {
            String existing = music.getName();
            // Null-safe: a stored row without a name must not abort the save.
            if (existing != null && existing.equals(name)) {
                return 0;
            }
        }
        return musicMapper.saveMusic(name, imgUrl, src);
    }

    /**
     * Lists all stored songs; the method is annotated as cacheable under "music".
     *
     * @return all stored songs
     */
    @Override
    @Cacheable(value = "music")
    public List<Music> listMusic() {
        return musicMapper.listMusic();
    }

    /**
     * Looks up a song by id.
     *
     * @param id id to look up
     * @return the mapper's result, or {@code null} when {@code id} exceeds
     *         {@link GetString#MAXSIZE}
     */
    @Override
    public Music getMusicById(int id) {
        if (id > GetString.MAXSIZE) {
            return null;
        }
        return musicMapper.getMusicById(id);
    }
}
utils:
package com.after.demo.utils;
/**
* @author 熊义杰
* @date 2019-3-16
*/
public class GetString {

    /** Largest song id that may be drawn. */
    public static final int MAXSIZE = 165;

    /**
     * Draws a random song id.
     *
     * <p>The result lies in {@code 1..MAXSIZE} inclusive. Song ids are
     * auto-incremented and the service accepts ids up to and including
     * {@code MAXSIZE}, so the draw is shifted by one: a plain
     * {@code (int) (Math.random() * MAXSIZE)} would yield 0 and never MAXSIZE.
     *
     * @return a pseudo-random id between 1 and {@link #MAXSIZE}, both inclusive
     */
    public static int getId() {
        return (int) (Math.random() * MAXSIZE) + 1;
    }
}
数据库效果:
随机获取一首歌:
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!