java爬取酷狗榜单歌曲信息并存入数据库 - Go语言中文社区

java爬取酷狗榜单歌曲信息并存入数据库


这里只解析一下代码,所需工具jsoup、HttpClient
先用HttpClient获取HTML,再用jsoup解析HTML,最后用Java代码提取所需要的信息。
之前写的有点问题,今天改了一下。因为通过hash值拼接的地址是个临时地址,存在数据库后一天就失效了,所以我改了一下。先把爬到的歌曲下载到本地,然后上传到七牛云的对象存储空间,再返回这个地址。最后把七牛云上的地址存到数据库中,这样就是永久的了。

HtmlManage:

package com.after.demo.spider;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * @author www.xyjz123.xyz
 * @description Thin wrapper that hands raw HTML text to jsoup for parsing.
 * @date 2019/4/19 19:06
 */
public class HtmlManage {

    /**
     * Parses an HTML string into a jsoup document.
     *
     * @param html the HTML markup to parse
     * @return the document produced by {@code Jsoup.parse}
     */
    public Document manage(String html){
        return Jsoup.parse(html);
    }
}

HttpGetConnect:

package com.after.demo.spider;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * @author www.xyjz123.xyz
 * @description Fetches the body of a URL as text through Apache HttpClient, sending browser-like headers.
 * @date 2019/4/19 19:05
 */
public class HttpGetConnect {

    private static final Log log = LogFactory.getLog(HttpGetConnect.class);

    /**
     * Issues a GET request and returns the response body decoded as text.
     *
     * <p>The call is best-effort: a non-2xx status, a missing body, or any exception
     * raised while requesting or reading is logged and results in an empty string.
     *
     * @param url         address to fetch
     * @param charsetName charset used to decode the body, e.g. UTF-8 or GB2312
     * @return the body with every line terminated by a newline, or an empty string on failure
     * @throws IOException if closing the HTTP client fails
     */
    public static String connect(String url,String charsetName) throws IOException{
        BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();

        CloseableHttpClient httpclient = HttpClients.custom()
                .setConnectionManager(connManager)
                .build();
        String content = "";

        try{
            HttpGet httpget = new HttpGet(url);

            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            httpget.setConfig(requestConfig);
            httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("Upgrade-Insecure-Requests", "1");
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            httpget.setHeader("cache-control", "max-age=0");

            // Closing the response releases the underlying connection even when reading fails.
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                int status = response.getStatusLine().getStatusCode();
                HttpEntity entity = response.getEntity();
                if (status >= 200 && status < 300 && entity != null) {
                    try (BufferedReader br = new BufferedReader(
                            new InputStreamReader(entity.getContent(), charsetName))) {
                        StringBuilder body = new StringBuilder();
                        String line;
                        while ((line = br.readLine()) != null){
                            // readLine() strips the terminator, so put a real newline back.
                            body.append(line).append('\n');
                        }
                        content = body.toString();
                    }
                }
            }

        }catch(Exception e){
            log.error("GET " + url + " failed", e);
        }finally{
            httpclient.close();
        }
        return content;
    }
}

MusicController:

package com.after.demo.controller;

import com.after.demo.entity.Music;
import com.after.demo.service.impl.MusicServiceImpl;
import com.after.demo.service.impl.UploadServiceImpl;
import com.after.demo.spider.FileDownload;
import com.after.demo.spider.HtmlManage;
import com.after.demo.spider.HttpGetConnect;
import com.after.demo.utils.GetString;
import com.after.demo.utils.JsonResult;
import com.google.gson.Gson;
import com.qiniu.common.QiniuException;
import com.qiniu.http.Response;
import com.qiniu.storage.model.DefaultPutRet;
import io.swagger.annotations.ApiOperation;
import net.sf.json.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author www.xyjz123.xyz
 * @description
 * @date 2019/4/19 21:18
 */
@RestController
public class MusicController {

    @Autowired
    MusicServiceImpl musicService;
    @Autowired
    UploadServiceImpl uploadService;

    public static String FILEPATH = "F:/music/";
    public static String mp3 = "https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191027067069941080546_1546235744250&"
            + "hash=HASH&album_id=0&_=TIME";

    public static final String LINK = "https://www.kugou.com/yy/rank/home/PAGE-33164.html?from=rank";

    @GetMapping("/music/save")
    @ApiOperation("将酷狗歌单爬取存入数据库")
    public JsonResult saveMusic() throws IOException{
        for(int i = 1 ; i < 10 ; i++){
            String url = LINK.replace("PAGE", i + "");
            getTitle(url);
        }
        return JsonResult.ok();
    }

    @PostMapping("/music/getOne")
    @ApiOperation("随机获取一首歌")
    public JsonResult getMusic(){
        int id = GetString.getId();
        Music music = musicService.getMusicById(id);
        return JsonResult.ok(music);
    }

    public String getTitle(String url) throws IOException {
        String content = HttpGetConnect.connect(url, "utf-8");
        HtmlManage html = new HtmlManage();
        Document doc = html.manage(content);
        Element ele = doc.getElementsByClass("pc_temp_songlist").get(0);
        Elements eles = ele.getElementsByTag("li");
        for(int i = 0 ; i < eles.size() ; i++){
            Element item = eles.get(i);
            String title = item.attr("title").trim();
            String link = item.getElementsByTag("a").first().attr("href");
            download(link,title);
        }
        return null;
    }

    public String download(String url,String name) throws IOException{
        String hash = "";
        String content = HttpGetConnect.connect(url, "utf-8");
        HtmlManage html = new HtmlManage();
        String regEx = ""hash":"[0-9A-Z]+"";
        // 编译正则表达式
        Pattern pattern = Pattern.compile(regEx);
        Matcher matcher = pattern.matcher(content);
        if (matcher.find()) {
            hash = matcher.group();
            hash = hash.replace(""hash":"", "");
            hash = hash.replace(""", "");
        }
        //爬取歌曲的封面图
        Document doc = html.manage(content);
        Element ele = doc.getElementsByClass("albumImg").get(0);
        String imgUrl = ele.getElementsByTag("img").attr("src");
        //利用hash值构造歌曲mp3地址
        String item = mp3.replace("HASH", hash);
        item = item.replace("TIME", System.currentTimeMillis() + "");

        String mp = HttpGetConnect.connect(item, "utf-8");

        mp = mp.substring(mp.indexOf("(") + 1, mp.length() - 3);

        JSONObject json = JSONObject.fromObject(mp);
        String playUrl = json.getJSONObject("data").getString("play_url");

        System.out.println(playUrl);
        FileDownload fileDownload = new FileDownload();
        fileDownload.download(playUrl,FILEPATH + name + ".mp3");

        String src = null;
        try{
            File file = new File(FILEPATH + name + ".mp3");
            Response response = uploadService.uploadFile(file);
            //解析上传成功的结果
            DefaultPutRet  putRet = new Gson().fromJson(response.bodyString(), DefaultPutRet.class);
            src = "http://www.jie12366.xyz/" + putRet.key;
        }catch (QiniuException e){
            e.printStackTrace();
        }

        //如果图片地址或mp3地址为空,则不爬取(歌曲是收费的无法爬取)
        if (StringUtils.isNotBlank(src) && StringUtils.isNotBlank(imgUrl)){
            musicService.saveMusic(name,imgUrl,src);
        }
        return playUrl;
    }
}

新增FileDownload类:

package com.after.demo.spider;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;

/**
 * @author www.xyjz123.xyz
 * @description Downloads the content of a URL into a local file.
 * @date 2019/4/24 16:37
 */
public class FileDownload {

    /**
     * Downloads {@code url} to {@code path}, making up to three attempts until a 200 response is received.
     *
     * <p>Failures are not reported to the caller: exceptions are printed, and a file that was
     * only partially written is deleted so that no truncated download is left behind.
     *
     * @param url  address to download
     * @param path destination path including the file name
     */
    public void download(String url,String path){
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(2000)
                .setConnectTimeout(2000).build();

        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);

        File target = new File(path);
        boolean writing = false;
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for(int attempt = 0; attempt < 3; attempt++){
                // Closing each response hands its connection back before the next attempt.
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    if (result.getStatusLine().getStatusCode() != 200 || result.getEntity() == null) {
                        continue;
                    }
                    try (BufferedInputStream in = new BufferedInputStream(result.getEntity().getContent());
                         BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target))) {
                        writing = true;
                        byte[] buffer = new byte[1024];
                        int len;
                        while((len = in.read(buffer, 0, buffer.length)) > -1){
                            out.write(buffer, 0, len);
                        }
                    }
                    writing = false;
                    break;
                }
            }
        }catch(Exception e){
            e.printStackTrace();
            if (writing) {
                // The copy was interrupted; remove the incomplete file.
                target.delete();
            }
        }finally{
            get.releaseConnection();
        }
    }
}

entity:

package com.after.demo.entity;

import com.gitee.sunchenbin.mybatis.actable.annotation.Column;
import com.gitee.sunchenbin.mybatis.actable.annotation.Table;
import com.gitee.sunchenbin.mybatis.actable.constants.MySqlTypeConstant;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * @author www.xyjz123.xyz
 * @description Entity mapped to the "music" table; accessors and constructors are generated by Lombok.
 * @date 2019/4/19 19:20
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
@Table(name = "music")
public class Music {

    /** Primary key, declared auto-increment. */
    @Column(name = "id",type = MySqlTypeConstant.INT,isKey = true,isAutoIncrement = true,length = 5)
    private int id;

    /** Song name; the column is declared unique. */
    @Column(name = "name",type = MySqlTypeConstant.VARCHAR,isUnique = true)
    private String name;

    /** Cover image address; the column is declared with length 80. */
    @Column(name = "imgUrl",type = MySqlTypeConstant.VARCHAR,length = 80)
    private String imgUrl;

    /** Address of the song's audio file. */
    @Column(name = "src",type = MySqlTypeConstant.VARCHAR)
    private String src;
}

mapper:

package com.after.demo.mapper;

import com.after.demo.entity.Music;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;

import java.util.List;

/**
 * @author www.xyjz123.xyz
 * @description MyBatis mapper for the music table.
 * @date 2019/4/19 21:03
 */
@Mapper
public interface MusicMapper {

    /**
     * Inserts one crawled song into the music table.
     * @param name song name
     * @param imgUrl song cover address
     * @param src song address
     * @return the insert statement's result (presumably the affected row count)
     */
    @Insert("insert into music(name,imgUrl,src) values(#{name},#{imgUrl},#{src})")
    int saveMusic(String name,String imgUrl,String src);

    /**
     * Reads every row of the music table.
     * @return list of all stored songs
     */
    @Select("select * from music")
    List<Music> listMusic();

    /**
     * Reads the song with the given id.
     * @param id id of the row to read
     * @return the matching Music row
     */
    @Select("select * from music where id=#{id}")
    Music getMusicById(int id);
}

service:

package com.after.demo.service.impl;

import com.after.demo.entity.Music;
import com.after.demo.mapper.MusicMapper;
import com.after.demo.service.MusicService;
import com.after.demo.utils.GetString;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.CacheConfig;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

import java.util.List;

/**
 * @author www.xyjz123.xyz
 * @description Service layer over MusicMapper: stores songs without duplicating names and reads them back.
 * @date 2019/4/19 21:09
 */
@Service
@CacheConfig
public class MusicServiceImpl implements MusicService {

    @Autowired
    MusicMapper musicMapper;

    /**
     * Inserts a song unless one with the same name is already stored.
     *
     * @return 0 when the name already exists, otherwise the mapper's insert result
     */
    @Override
    public int saveMusic(String name, String imgUrl, String src) {
        if (nameAlreadyStored(name)) {
            return 0;
        }
        return musicMapper.saveMusic(name,imgUrl,src);
    }

    /** Scans all stored songs for one whose name equals {@code name}. */
    private boolean nameAlreadyStored(String name) {
        List<Music> stored = musicMapper.listMusic();
        for (int i = 0; i < stored.size(); i++) {
            if (stored.get(i).getName().equals(name)) {
                return true;
            }
        }
        return false;
    }

    /** Returns every stored song; results are cached under "music". */
    @Override
    @Cacheable(value = "music")
    public List<Music> listMusic() {
        return musicMapper.listMusic();
    }

    /**
     * Looks up a song by id.
     *
     * @return the mapper's result, or {@code null} when {@code id} exceeds {@code GetString.MAXSIZE}
     */
    @Override
    public Music getMusicById(int id) {
        if (id > GetString.MAXSIZE) {
            return null;
        }
        return musicMapper.getMusicById(id);
    }
}

utils:

package com.after.demo.utils;

/**
 * Helper for picking a random song id.
 *
 * @author Xiong Yijie
 * @date 2019-3-16
 */

public class GetString {

    /** Largest id that {@link #getId()} may return. */
    public static final int MAXSIZE = 165;

    /**
     * Picks a random id in the range 1 to {@link #MAXSIZE}, both inclusive.
     *
     * <p>The range starts at 1 because the ids it is used to look up come from an
     * auto-increment key; 0 would never match a row.
     *
     * @return a random id between 1 and {@code MAXSIZE}
     */
    public static int getId(){
        // Math.random() is in [0, 1), so the cast yields 0..MAXSIZE-1; shift it to 1..MAXSIZE.
        return (int)(Math.random() * MAXSIZE) + 1;
    }
}

数据库效果:
在这里插入图片描述

随机获取一首歌:
在这里插入图片描述

版权声明:本文来源CSDN,感谢博主原创文章,遵循 CC 4.0 by-sa 版权协议,转载请附上原文出处链接和本声明。
原文链接:https://blog.csdn.net/qq_40663357/article/details/89469770
站方申明:本站部分内容来自社区用户分享,若涉及侵权,请联系站方删除。
  • 发表于 2020-02-13 12:04:25
  • 阅读 ( 1274 )
  • 分类:数据库

0 条评论

请先 登录 后评论

官方社群

GO教程

猜你喜欢