# get_songs.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: zhiying
URL: www.zhouzying.cn
Date: 2018-9-9
Update: 2019-8-5
Description: 爬取网易云音乐歌手热门歌曲（更新）
"""
import requests
import csv
from bs4 import BeautifulSoup
from requests import RequestException
import time


def parse_html_page(url, timeout=10):
    """
    Fetch an artist page and extract the entries of its popular-song list.

    The song list is located in markup of the form
    ``<ul class="f-hide"><li><a href="/song?id=65766">song name</a></li>...</ul>``.

    :param url: artist page URL that carries the artist id
    :param timeout: seconds to wait for the server before giving up
    :return: list of ``<li>`` tags, each wrapping an ``<a href="/song?id=...">``
             element; an empty list when the request fails, the status code
             is not 200, or the response body is empty
    """
    # Adjacent string literals are concatenated, so the separating space
    # before "Chrome/" has to be written explicitly.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(url, headers=headers, timeout=timeout)
    except RequestException as err:
        print(err)
        return []

    if r.status_code != 200:
        return []

    r.encoding = 'utf-8'
    html = r.text
    if not html:
        return []

    # Parsed with html5lib, which is tolerant of malformed markup.
    soup = BeautifulSoup(html, 'html5lib')
    # Walk the matched <ul> tags directly instead of re-parsing their string form.
    return [li for ul in soup.find_all('ul', 'f-hide') for li in ul.find_all('li')]


# Example: fetch the popular songs of the artist 薛之谦 (artist id 5781)
# url = "https://music.163.com/artist?id=5781"
# items = parse_html_page(url)

# Append one artist's popular songs (song id and song name) to the CSV file.
def write_to_csv(items, artist_name):
    """
    Append an artist header row followed by one row per song to
    ``music163_songs.csv`` in the current working directory.

    :param items: iterable of ``<li>`` tags whose ``a`` child has an ``href``
                  of the form ``/song?id=<id>`` and the song name as text;
                  ``None`` or an empty iterable writes only the header row
    :param artist_name: artist name written in the header row
    :return: None
    """
    # newline='' hands line-ending control to the csv module; without it
    # every row is followed by a blank line on Windows.
    with open("music163_songs.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["歌手名字", artist_name])

        for item in items or ():
            song_id = item.a['href'].replace('/song?id=', '')
            writer.writerow([song_id, item.a.text])
    # The with statement has already closed the file at this point.


# Read artist ids and artist names from the artists CSV file.
def read_csv():
    """
    Yield ``(artist_id, artist_name)`` pairs from ``music163_artists.csv``
    in the current working directory.

    The header row (first column equal to ``artist_id``) and rows with fewer
    than two columns (e.g. blank lines) are skipped. Only the first two
    columns of each row are used.

    :return: generator of ``(artist_id, artist_name)`` string tuples
    """
    # utf-8-sig decodes plain UTF-8 unchanged and strips a leading BOM if
    # present, so the header comparison below still matches; newline='' is
    # what the csv module expects for files it reads.
    with open("music163_artists.csv", "r", encoding="utf-8-sig", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                continue
            artist_id, artist_name = row[0], row[1]
            # Compare by value: an identity test against a string literal is
            # not guaranteed to be true even for equal strings.
            if artist_id == "artist_id":
                continue
            yield artist_id, artist_name
    # The file is closed automatically once control leaves the with block.


def main():
    """
    Fetch and store the popular songs of every artist listed by ``read_csv``.

    For each artist the artist page is downloaded and parsed, and the songs
    are appended to the CSV output. Artists whose page yields no songs
    (request failure, non-200 response, or an empty list) are reported and
    skipped instead of being passed on to the writer. The loop pauses three
    seconds after every artist to keep the request rate low.
    """
    for artist_id, artist_name in read_csv():
        print('正在获取{}的热门歌曲'.format(artist_name))
        url = 'https://music.163.com/artist?id=' + str(artist_id)
        items = parse_html_page(url)
        if items:
            print('正在写入。。。')
            write_to_csv(items, artist_name)
            print('{}的热门歌曲获取成功！'.format(artist_name))
        else:
            # Nothing was fetched: do not write and do not claim success.
            print('{}的热门歌曲获取失败，已跳过'.format(artist_name))
        time.sleep(3)


# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()