首页 > 程序开发 > 软件开发 > 其他 >

Webmagic网易云音乐爬虫

2016-11-02

发现了一个java爬虫框架,熟悉了一下写了一个爬取网易云音乐专辑歌曲信息的爬虫,其中歌曲的歌词和评论是动态获取的,我只能单独自己写代码进行请求。

发现了一个java爬虫框架,熟悉了一下写了一个爬取网易云音乐专辑歌曲信息的爬虫,其中歌曲的歌词和评论是动态获取的,我只能单独自己写代码进行请求。

代码如下

import java.io.IOException;  
import java.io.UnsupportedEncodingException;  
import java.math.BigInteger;  
import java.security.SecureRandom;  
import java.text.SimpleDateFormat;  
import java.util.ArrayList;  
import java.util.Date;  
import java.util.List;  
import javax.crypto.Cipher;  
import javax.crypto.spec.IvParameterSpec;  
import javax.crypto.spec.SecretKeySpec;  
import javax.xml.bind.DatatypeConverter;  
import org.apache.commons.codec.binary.Base64;  
import org.apache.http.HttpEntity;  
import org.apache.http.NameValuePair;  
import org.apache.http.client.entity.UrlEncodedFormEntity;  
import org.apache.http.client.methods.CloseableHttpResponse;  
import org.apache.http.client.methods.HttpPost;  
import org.apache.http.impl.client.CloseableHttpClient;  
import org.apache.http.impl.client.HttpClients;  
import org.apache.http.message.BasicNameValuePair;  
import org.apache.http.util.EntityUtils;  
import com.alibaba.fastjson.JSON;  
import com.alibaba.fastjson.JSONPath;  
import us.codecraft.webmagic.Page;  
import us.codecraft.webmagic.Site;  
import us.codecraft.webmagic.Spider;  
import us.codecraft.webmagic.pipeline.ConsolePipeline;  
import us.codecraft.webmagic.pipeline.FilePipeline;  
import us.codecraft.webmagic.processor.PageProcessor;  
  
public class NetEaseMusicPageProcessor implements PageProcessor {  
      
    //正则表达式\\. \\转义java中的\  \.转义正则中的.  
      
    //主域名  
    public static final String BASE_URL = "http://music.163.com/";  
  
    //匹配专辑URL  
    public static final String ALBUM_URL = "http://music\\.163\\.com/album\\?id=\\d+";  
      
    //匹配歌曲URL  
    public static final String MUSIC_URL = "http://music\\.163\\.com/song\\?id=\\d+";  
      
    //初始地址, JAY_CHOU 周杰伦的床边故事专辑, 可以改为其他歌手专辑地址  
    public static final String START_URL = "http://music.163.com/album?id=34720827";  
      
    //加密使用到的文本  
    public static final String TEXT = "{\"username\": \"\", \"rememberLogin\": \"true\", \"password\": \"\"}";  
      
    //爬取结果保存文件路径  
    public static final String RESULT_PATH = "/home/user/workspace/WebMagicCrawler/result/";  
      
    private Site site = Site.me()  
                            .setDomain("http://music.163.com")  
                            .setSleepTime(1000)  
                            .setRetryTimes(30)  
                            .setCharset("utf-8")  
                            .setTimeOut(30000)  
                            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");  
  
    @Override  
    public Site getSite() {  
        return site;  
    }  
      
    @Override  
    public void process(Page page) {  
        //根据URL判断页面类型  
        if (page.getUrl().regex(ALBUM_URL).match()) {  
            //爬取歌曲URl加入队列  
            page.addTargetRequests(page.getHtml().xpath("//div[@id=\"song-list-pre-cache\"]").links().regex(MUSIC_URL).all());  
              
            //爬取专辑URL加入队列  
            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"cver u-cover u-cover-3\"]").links().regex(ALBUM_URL).all());  
        } else {  
            String url = page.getUrl().toString();  
            page.putField("title", page.getHtml().xpath("//em[@class='f-ff2']/text()"));  
            page.putField("author", page.getHtml().xpath("//p[@class='des s-fc4']/span/a/text()"));  
            page.putField("album", page.getHtml().xpath("//p[@class='des s-fc4']/a/text()"));  
            page.putField("URL", url);  
              
            //单独对AJAX请求获取评论数, 使用JSON解析返回结果  
            page.putField("commentCount", JSONPath.eval(JSON.parse(crawlAjaxUrl(url.substring(url.indexOf("id=") + 3))), "$.total"));  
        }  
    }  
      
    public static void main(String[] args) {  
        Spider.create(new NetEaseMusicPageProcessor())  
        //初始URL  
        .addUrl(START_URL)  
        .addPipeline(new ConsolePipeline())  
        //结果输出到文件  
        .addPipeline(new FilePipeline(RESULT_PATH))  
        .run();  
    }  
      
    //对AJAX数据进行单独请求  
    public String crawlAjaxUrl(String songId) {   
        CloseableHttpClient httpclient = HttpClients.createDefault();  
        CloseableHttpResponse response =null;  
          
        try {  
            //参数加密  
            String secKey = new BigInteger(100, new SecureRandom()).toString(32).substring(0, 16);  
            String encText = aesEncrypt(aesEncrypt(TEXT, "0CoJUm6Qyw8W8jud"), secKey);  
            String encSecKey = rsaEncrypt(secKey);  
              
            HttpPost httpPost = new HttpPost("http://music.163.com/weapi/v1/resource/comments/R_SO_4_" + songId + "/?csrf_token=");  
            httpPost.addHeader("Referer", BASE_URL);  
              
            List<NameValuePair> ls = new ArrayList<NameValuePair>();  
            ls.add(new BasicNameValuePair("params", encText));  
            ls.add(new BasicNameValuePair("encSecKey", encSecKey));  
              
            UrlEncodedFormEntity paramEntity = new UrlEncodedFormEntity(ls, "utf-8");  
            httpPost.setEntity(paramEntity);  
              
            response = httpclient.execute(httpPost);  
            HttpEntity entity = response.getEntity();  
              
            if (entity != null) {  
                return EntityUtils.toString(entity, "utf-8");  
            }  
              
        } catch (Exception e) {  
            e.printStackTrace();  
        } finally {  
            try {  
                response.close();  
                httpclient.close();  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
        }  
          
        return "";  
    }  
}
关于网易云音乐API加密可以参见我另外一篇博客。

下面是运行结果

\

[INFO] 2016-10-28 10:51:49 Spider http://music.163.com started!

[INFO] 2016-10-28 10:51:49 downloading page http://music.163.com/album?id=34720827

get page: http://music.163.com/album?id=34720827

[INFO] 2016-10-28 10:51:50 downloading page http://music.163.com/song?id=415792916

get page: http://music.163.com/song?id=415792916

title: 床边故事

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=415792916

commentCount: 52535

[INFO] 2016-10-28 10:51:52 downloading page http://music.163.com/song?id=418602084

get page: http://music.163.com/song?id=418602084

title: 说走就走

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418602084

commentCount: 24256

[INFO] 2016-10-28 10:51:53 downloading page http://music.163.com/song?id=418603076

get page: http://music.163.com/song?id=418603076

title: 一点点

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418603076

commentCount: 33127

[INFO] 2016-10-28 10:51:54 downloading page http://music.163.com/song?id=415792918

get page: http://music.163.com/song?id=415792918

title: 前世情人

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=415792918

commentCount: 26761

[INFO] 2016-10-28 10:51:55 downloading page http://music.163.com/song?id=418602085

get page: http://music.163.com/song?id=418602085

title: 英雄

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418602085

commentCount: 13321

[INFO] 2016-10-28 10:51:57 downloading page http://music.163.com/song?id=417250561

get page: http://music.163.com/song?id=417250561

title: 不该(with aMEI)

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=417250561

commentCount: 37877

[INFO] 2016-10-28 10:51:58 downloading page http://music.163.com/song?id=418602086

get page: http://music.163.com/song?id=418602086

title: 土耳其冰淇淋

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418602086

commentCount: 21751

[INFO] 2016-10-28 10:51:59 downloading page http://music.163.com/song?id=418603077

get page: http://music.163.com/song?id=418603077

title: 告白气球

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418603077

commentCount: 117647

[INFO] 2016-10-28 10:52:00 downloading page http://music.163.com/song?id=417247652

get page: http://music.163.com/song?id=417247652

title: Now You See Me

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=417247652

commentCount: 17485

[INFO] 2016-10-28 10:52:01 downloading page http://music.163.com/song?id=418602087

get page: http://music.163.com/song?id=418602087

title: 爱情废柴

author: 周杰伦

album: 周杰伦的床边故事

URL: http://music.163.com/song?id=418602087

commentCount: 24268

[INFO] 2016-10-28 10:52:02 downloading page http://music.163.com/album?id=34685590

get page: http://music.163.com/album?id=34685590

[INFO] 2016-10-28 10:52:04 downloading page http://music.163.com/album?id=34588039

get page: http://music.163.com/album?id=34588039

[INFO] 2016-10-28 10:52:05 downloading page http://music.163.com/album?id=3084335

get page: http://music.163.com/album?id=3084335

[INFO] 2016-10-28 10:52:06 downloading page http://music.163.com/album?id=2662137

get page: http://music.163.com/album?id=2662137

[INFO] 2016-10-28 10:52:07 downloading page http://music.163.com/song?id=411921852

get page: http://music.163.com/song?id=411921852

title: 惊叹号(Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=411921852

commentCount: 16768

[INFO] 2016-10-28 10:52:08 downloading page http://music.163.com/song?id=412327297

get page: http://music.163.com/song?id=412327297

title: 龙拳 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327297

commentCount: 2563

[INFO] 2016-10-28 10:52:09 downloading page http://music.163.com/song?id=412327298

get page: http://music.163.com/song?id=412327298

title: 最后的战役 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327298

commentCount: 3849

[INFO] 2016-10-28 10:52:11 downloading page http://music.163.com/song?id=412327299

get page: http://music.163.com/song?id=412327299

title: 天台 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327299

commentCount: 1283

[INFO] 2016-10-28 10:52:12 downloading page http://music.163.com/song?id=412327300

get page: http://music.163.com/song?id=412327300

title: 比较大的大提琴 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327300

commentCount: 1223

[INFO] 2016-10-28 10:52:13 downloading page http://music.163.com/song?id=412327301

get page: http://music.163.com/song?id=412327301

title: 快门慢舞 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327301

commentCount: 1287

[INFO] 2016-10-28 10:52:14 downloading page http://music.163.com/song?id=412327302

get page: http://music.163.com/song?id=412327302

title: 打架舞 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327302

commentCount: null

[INFO] 2016-10-28 10:52:15 downloading page http://music.163.com/song?id=412327303

get page: http://music.163.com/song?id=412327303

title: 哪里都是你 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327303

commentCount: 2674

[INFO] 2016-10-28 10:52:16 downloading page http://music.163.com/song?id=412327304

get page: http://music.163.com/song?id=412327304

title: 一路向北 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327304

commentCount: 11502

[INFO] 2016-10-28 10:52:17 downloading page http://music.163.com/song?id=412327305

get page: http://music.163.com/song?id=412327305

title: 不能说的秘密 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327305

commentCount: 3595

[INFO] 2016-10-28 10:52:18 downloading page http://music.163.com/song?id=412327306

get page: http://music.163.com/song?id=412327306

title: 双截棍 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327306

commentCount: 1892

[INFO] 2016-10-28 10:52:20 downloading page http://music.163.com/song?id=412327307

get page: http://music.163.com/song?id=412327307

title: 明明就 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327307

commentCount: 1898

[INFO] 2016-10-28 10:52:21 downloading page http://music.163.com/song?id=412327308

get page: http://music.163.com/song?id=412327308

title: Mine Mine (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327308

commentCount: 1957

[INFO] 2016-10-28 10:52:22 downloading page http://music.163.com/song?id=412327309

get page: http://music.163.com/song?id=412327309

title: 龙卷风 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327309

commentCount: 1923

[INFO] 2016-10-28 10:52:23 downloading page http://music.163.com/song?id=412327310

get page: http://music.163.com/song?id=412327310

title: 公公偏头痛 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327310

commentCount: 1348

[INFO] 2016-10-28 10:52:24 downloading page http://music.163.com/song?id=412327311

get page: http://music.163.com/song?id=412327311

title: 青花瓷 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327311

commentCount: 1916

[INFO] 2016-10-28 10:52:25 downloading page http://music.163.com/song?id=412327312

get page: http://music.163.com/song?id=412327312

title: 斗牛 / 水手怕水 / 大笨钟 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327312

commentCount: 1595

[INFO] 2016-10-28 10:52:27 downloading page http://music.163.com/song?id=412327313

get page: http://music.163.com/song?id=412327313

title: 彩虹 / 轨迹 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327313

commentCount: 3209

[INFO] 2016-10-28 10:52:28 downloading page http://music.163.com/song?id=412327314

get page: http://music.163.com/song?id=412327314

title: 手语 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327314

commentCount: 1403

[INFO] 2016-10-28 10:52:29 downloading page http://music.163.com/song?id=412327315

get page: http://music.163.com/song?id=412327315

title: 开不了口 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327315

commentCount: 3067

[INFO] 2016-10-28 10:52:30 downloading page http://music.163.com/song?id=412327316

get page: http://music.163.com/song?id=412327316

title: 乌克丽丽 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327316

commentCount: 1664

[INFO] 2016-10-28 10:52:32 downloading page http://music.163.com/song?id=412327317

get page: http://music.163.com/song?id=412327317

title: 阳光宅男 (Live)

author: 周杰伦

album: 魔天伦 世界巡回演唱会

URL: http://music.163.com/song?id=412327317

commentCount: 2018

[INFO] 2016-10-28 10:52:33 downloading page http://music.163.com/song?id=407679169

get page: http://music.163.com/song?id=407679169

title: 英雄

author: 周杰伦

album: 英雄

URL: http://music.163.com/song?id=407679169

commentCount: 72800

[INFO] 2016-10-28 10:52:34 downloading page http://music.163.com/song?id=29822016

get page: http://music.163.com/song?id=29822016

title: 阳明山

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822016

commentCount: 3858

[INFO] 2016-10-28 10:52:35 downloading page http://music.163.com/song?id=29822010

get page: http://music.163.com/song?id=29822010

title: 窃爱

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822010

commentCount: 5511

[INFO] 2016-10-28 10:52:36 downloading page http://music.163.com/song?id=29818120

get page: http://music.163.com/song?id=29818120

title: 算什么男人

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29818120

commentCount: 24813

[INFO] 2016-10-28 10:52:37 downloading page http://music.163.com/song?id=29822015

get page: http://music.163.com/song?id=29822015

title: 天涯过客

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822015

commentCount: 10229

[INFO] 2016-10-28 10:52:39 downloading page http://music.163.com/song?id=29822033

get page: http://music.163.com/song?id=29822033

title: 怎么了

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822033

commentCount: 5023

[INFO] 2016-10-28 10:52:40 downloading page http://music.163.com/song?id=29822032

get page: http://music.163.com/song?id=29822032

title: 一口气全念对

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822032

commentCount: 5846

[INFO] 2016-10-28 10:52:41 downloading page http://music.163.com/song?id=29822017

get page: http://music.163.com/song?id=29822017

title: 我要夏天

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822017

commentCount: 5938

[INFO] 2016-10-28 10:52:42 downloading page http://music.163.com/song?id=29822018

get page: http://music.163.com/song?id=29822018

title: 手写的从前

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822018

commentCount: 22051

[INFO] 2016-10-28 10:52:47 downloading page http://music.163.com/song?id=29818117

get page: http://music.163.com/song?id=29818117

title: 鞋子特大号

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29818117

commentCount: null

[INFO] 2016-10-28 10:52:48 downloading page http://music.163.com/song?id=29822013

get page: http://music.163.com/song?id=29822013

title: 听爸爸的话

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822013

commentCount: null

[INFO] 2016-10-28 10:52:49 downloading page http://music.163.com/song?id=29822012

get page: http://music.163.com/song?id=29822012

title: 美人鱼

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822012

commentCount: 10662

[INFO] 2016-10-28 10:52:50 downloading page http://music.163.com/song?id=29822014

get page: http://music.163.com/song?id=29822014

title: 听见下雨的声音

author: 周杰伦

album: 哎呦,不错哦

URL: http://music.163.com/song?id=29822014

commentCount: 17453

[INFO] 2016-10-28 10:52:51 downloading page http://music.163.com/song?id=27698922

get page: http://music.163.com/song?id=27698922

title: 你怎么说+红尘客栈+千里之外

author: 周杰伦

album: 周杰伦2013《魔天伦》台北小巨蛋演唱会

URL: http://music.163.com/song?id=27698922

commentCount: 1915

相关文章
最新文章
热点推荐