以‘小小呵’作品为例
作品首页:http://5sing.kugou.com/38608764/fc/1.html
话不多说,贴代码
import sys
import time
import urllib.parse
from urllib import request
from bs4 import BeautifulSoup
import re
# Browser-like HTTP request headers, defined so the crawler can present
# itself as a regular desktop browser.
# NOTE(review): no code visible in this file actually passes this dict to a
# request -- confirm whether it is meant to be used by the urlopen calls.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, compress",
    "Accept-Language": "en-us;q=0.5,en;q=0.3",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0",
}
def spider(singer, page):
    """Crawl a singer's cover-song ("fc") list pages and download each song.

    For every list page from 1 to ``page`` the function fetches the page,
    finds the song links, derives each song id from the link's file name and
    hands the song title plus the song-URL service link to ``jiexi``.

    Args:
        singer: singer (user) id, interpolated into the list-page URL.
        page: number of list pages to visit, starting at page 1.
    """
    for page_no in range(1, page + 1):
        print(f'\n*****************************page {page_no} *****************************')
        url = 'http://5sing.kugou.com/' + str(singer) + '/fc/' + str(page_no) + '.html'
        time.sleep(5)  # pause between list pages to go easy on the server
        # Close the HTTP response as soon as the page has been read.
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response.read(), 'lxml')
        for item in soup.select(' ul > li > strong > a'):
            href = item.get('href')
            if not href:
                # An anchor without a target carries no song id; skip it
                # instead of failing on string concatenation with None.
                continue
            print('\n' + item.text + ' ' + href)
            # The song id is the file name of the link without its extension.
            song_id = href.split('/')[-1].split('.')[0]
            # Service endpoint that is queried for the actual download link.
            durl = ('http://service.5sing.kugou.com/song/getSongUrl?version=0&songid='
                    + song_id + '&songtype=fc')
            jiexi(item.text, durl)
def _fetch_mp3_url(url):
    """Query the song-URL service at ``url`` and return the MP3 link it contains.

    Raises:
        ValueError: if the response holds no ``body > p`` text or no
            ``http...mp3`` link inside it.
    """
    # Close the HTTP response as soon as its body has been read.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'lxml')
    paragraphs = soup.select('body>p')
    if not paragraphs:
        raise ValueError('no <p> payload in response from ' + url)
    match = re.search(r'http.*mp3', paragraphs[0].text)
    if match is None:
        raise ValueError('no mp3 link in response from ' + url)
    # The pattern is greedy and may run past the end of the first link, so
    # cut the match at the first double quote.
    return match[0].split('"')[0]


def jiexi(name, url):
    """Resolve the download link for one song and download it ("jiexi" = parse).

    The first attempt may fail for any reason (request error, unexpected
    payload, failed download). In that case the function waits 10 seconds and
    tries exactly once more; a failure of the second attempt propagates to
    the caller.

    Args:
        name: song title, forwarded to ``download`` for the file name.
        url: song-URL service link to query for the MP3 address.
    """
    try:
        download(name, _fetch_mp3_url(url))
    except Exception:
        # Only ordinary errors trigger the retry; KeyboardInterrupt and
        # SystemExit are no longer swallowed, so Ctrl-C stops the crawl.
        print('>>>sleep(10)')
        time.sleep(10)
        download(name, _fetch_mp3_url(url))
def download(name, url, directory='D:/song/', suffix='__by小小呵.mp3'):
    """Download one MP3 and store it as ``<directory>/<name><suffix>``.

    Args:
        name: song title used as the stem of the file name.
        url: MP3 link; slashes may arrive backslash-escaped (``\\/``).
        directory: target directory; created when it does not exist yet.
        suffix: text appended to the title, including the file extension.
    """
    import os  # local import: the file's import header does not provide os

    # Undo the backslash-escaped slashes ("http:\/\/...") in the link.
    durl = url.replace('\\/', '/')
    # Song titles may contain characters that are not allowed in Windows file
    # names; replace them so the save cannot fail on the title alone.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, safe_name + suffix)
    request.urlretrieve(durl, path, _progress)
    time.sleep(1)  # short pause between consecutive downloads
def _progress(block_num, block_size, total_size):#显示下载进度
'''回调函数
@block_num: 已经下载的数据块
@block_size: 数据块的大小
@total_size: 远程文件的大小
'''
sys.stdout.write('\r>> Downloading %s %.1f%%' % ('',float(block_num * block_size) / float(total_size) * 100.0))
sys.stdout.flush()
if __name__ == "__main__":
    # Script entry point: the singer id to crawl and how many list pages of
    # that singer to walk through.
    singer, page = 38608764, 19
    spider(singer, page)