1. Use the requests library to fetch the page.
2. Use XPath to extract the target image URLs from the page.
3. Use the os library to save the images to the local disk.
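To make the three steps concrete before the full spider, here is a minimal sketch that fetches one chapter page, pulls the image URLs with XPath, and writes them to disk. The chapter URL is the one used later in the post; the save directory and the shortened User-Agent string are placeholders, and the XPath expressions mirror the ones in the spider below (they depend on the current manhuadao.cn markup, so they may need adjusting).

# coding: utf-8
import os
import requests
from lxml import etree

page_url = 'http://www.manhuadao.cn/Comic/ComicView?comicid=58df8c73d401da325c9cf77c&chapterid=9587480'
save_dir = 'E:/python/manhuadao/pics/'   # placeholder save directory
header = {'User-Agent': 'Mozilla/5.0'}   # placeholder User-Agent string

# Step 1: fetch the page with requests
r = requests.get(page_url, headers=header)

# Step 2: extract the image URLs with XPath
html = etree.HTML(r.text)
img_srcs = html.xpath('//div[@class="center-t"]//div[@class="main-content"]//img/@src')

# Step 3: save each image locally
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
for no, src in enumerate(img_srcs):
    img = requests.get(src, headers=header)
    with open(os.path.join(save_dir, '%s.jpg' % no), 'wb') as f:
        f.write(img.content)

The full spider below wraps the same logic in a class and, in addition, follows the "next page" link so an entire chapter is downloaded in one run.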
# -*- coding: utf-8 -*-
import requests
import os
from lxml import etree
from urllib.parse import unquote
# import urllib.request


class DmdSpider(object):
    name = "dmd"
    save_path = 'E:/python/manhuadao/pics/'
    __picNo = 0

    # Step 1: fetch a chapter page from manhuadao.cn
    def start_request(self, url):
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
        r = requests.get(url, headers=header)
        self.__get_imgs(r)
        # Follow the "next page" link so the whole chapter is crawled automatically
        html = etree.HTML(r.text)
        next_urls = html.xpath('//div[@class="read-bottom"]//a[@class="next"]/@href')
        for href in next_urls:
            nexturl = href.split('=')[-1]
            self.start_request(unquote(nexturl))

    # Step 2: extract the image URLs from the page with XPath
    def __get_imgs(self, response):
        html = etree.HTML(response.text)
        img_srcs = html.xpath('//div[@class="center-t"]//div[@class="main-content"]//img/@src')
        for url in img_srcs:
            self.__save_img(url)

    # Step 3: download one image and save it locally
    def __save_img(self, url):
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
        # Alternative: name the file after the image URL and download with urllib.request
        # arr = url.split('-')
        # picName = arr[-1].split('.')[0]
        # request = urllib.request.Request(url, None, header)
        # response = urllib.request.urlopen(request)
        # with open('E:/python/manhuadao/pics/%s.jpg' % picName, "wb") as f:
        #     f.write(response.read())
        response = requests.get(url, headers=header)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        with open(self.save_path + '%s.jpg' % self.__picNo, "wb") as f:
            f.write(response.content)
        self.__picNo = self.__picNo + 1


def main():
    dmd = DmdSpider()
    dmd.start_request('http://www.manhuadao.cn/Comic/ComicView?comicid=58df8c73d401da325c9cf77c&chapterid=9587480')


if __name__ == "__main__":
    main()