python 爬虫入门到进阶（五）

实战项目妹子图爬取

多图预警

就不放太多了，等下可以自己尝试一下

打开我们要爬取的网站，按 F12 打开控制台

导入头

# -*- coding: utf-8 -*-
import urllib2
import os
import sys
from bs4 import BeautifulSoup

设置编码和头

# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter startup, so sys must be reloaded before it can be called.
reload(sys)
# Presumably needed so that the non-ASCII byte-string literals used later can
# be concatenated with the text returned by BeautifulSoup -- confirm.
sys.setdefaultencoding('utf8')
url = 'http://www.mzitu.com'
# Request headers sent with every request in this script.
heade = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Referer':'http://www.mzitu.com',
}
# Build the request for the front page and fetch it.
request = urllib2.Request(url,headers=heade)
respons = urllib2.urlopen(request)

定义存储路径

# Root directory under which one sub-directory per album title is created.
path = "/home/gwj/snap/demo/image/"

解析获取的页面

1 2	soup = BeautifulSoup(respons.read(),"lxml") print soup.a

这里打印出页面中的第一个 a 标签；lxml 是解析器，respons.read() 返回页面的内容

由于我们要进入到每个子页面才能保存图片，这里我们要先找到最大页码，以此决定循环次数（每次打开子页面，保存图片是一个循环）

# The pagination links are the <a class="page-numbers"> tags; the
# second-to-last one is taken as the highest page number (presumably the last
# one is the "next page" link -- confirm against the live markup).
page = soup.find_all('a',class_ = 'page-numbers')
# Fall back to a single page when fewer than two pagination links exist,
# instead of failing with an IndexError on page[-2].
max_page = page[-2].text if len(page) >= 2 else '1'

解释一下：soup.find() 只返回第一个匹配的标签，soup.find_all() 返回所有匹配的标签组成的列表

soup.find('img') 查找第一个 img 标签，再通过 ['src'] 取出它的 src 链接属性

class_ 按类名（HTML 的 class 属性）筛选目标标签

不同页面之间都有相似的地方

这里存储一下这个 url，用来构建访问子页面的链接

# Common prefix of every listing-page URL; the page number is appended to it.
same_url = 'http://www.mzitu.com/page/'

循环访问各个子页面，链接格式为 same_url + 页码 n（即 http://www.mzitu.com/page/n）

for n in range(1, int(max_page) + 1):
    # Fetch listing page n, sending the same headers as the first request.
    listing_request = urllib2.Request(same_url + str(n), headers=heade)
    soup = BeautifulSoup(urllib2.urlopen(listing_request), "lxml")
    # Only the first <div class="postlist"> is searched; every
    # <a target="_blank"> inside it is collected.
    post_list = soup.find('div', class_='postlist')
    all_a = post_list.find_all('a', target='_blank')

访问详情页面，提取用户名，URL

# Visit each link collected from the listing page; the link text is used as the album title.
for a in all_a:
    title = a.get_text() # text of the link
    if(title!=''):
        print ("准备爬取:"+title)
        if(os.path.exists(path+title.strip().replace('?',''))):
            print("目录已存在")
            flag=1 # the directory existed before this pass
        else:
            os.makedirs(path+title.strip().replace('?',''))
            flag=0
        os.chdir(path + title.strip().replace('?','')) # later files are opened relative to this directory
        href = a['href']
        request2 = urllib2.Request(href,headers=heade)
        html = urllib2.urlopen(request2)
        mess = BeautifulSoup(html,"lxml")
        pic_max = mess.find_all('span')
        if len(pic_max) > 10: # index 10 is read on the next line, so at least 11 spans are required
            pic_max = pic_max[10].text # highest picture page number
        else:
            pic_max = '10'
        if(flag == 1 and len(os.listdir(path+title.strip().replace('?',''))) >= int(pic_max)):
            print('已经保存完毕，跳过')
            continue

上方第 18 行代码，判断 find_all 取回的 span 列表能否用来读取最大页数：可以时从中读出该图集的图片最大页数，否则默认按 10 页处理。同时本页也包含了图片的 url

第 10 行代码，创建 path + 标题的新目录；第 12 行代码，切换到这个目录，后面下载的图片都保存在其中

第 22 行，判断目录是否在本次运行之前就已存在，并且其中的文件数不少于最大页数；两个条件都满足说明这个图集已经下载完毕，用 continue 跳过本次循环

如有不理解可在评论区评论

获取图片链接

for num in range(1, int(pic_max) + 1):
    # Each picture has its own page at <album url>/<index>.
    pic = '/'.join((href, str(num)))
    request = urllib2.Request(pic, headers=heade)
    html = urllib2.urlopen(request)
    mess = BeautifulSoup(html, "lxml")
    # The wanted <img> is the one whose alt text equals the album title.
    pic_url = mess.find('img', alt=title)

判断链接是否为空（防止报错）

if pic_url is None:
    continue
else:
    request = urllib2.Request(pic_url['src'],headers=heade)

截取链接中最后一个 '/' 之后的字符作为文件名，写入文件

# The file name is whatever follows the last '/' of the image URL.
file_name = pic_url['src'].split(r'/')[-1]
# `with` closes the file even if reading the response or writing fails.
with open(file_name,'wb') as f:
    f.write(html.read())

完整代码

# -*- coding: utf-8 -*-
import urllib2
import os
import sys
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')
url = 'http://www.mzitu.com'
#头
heade = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Referer':'http://www.mzitu.com',
}

request = urllib2.Request(url,headers=heade)
#########
#get方式提交的另一种方法
#request.add_header('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
#request.add_header('Referer','http://www.mzitu.com/')
######
respons = urllib2.urlopen(request)
path = "/home/gwj/snap/demo/image/"
#print respons.read()


#解析
#使用自带的html.parser解析，速度慢但通用,这里用ｌｘｍｌ解析器
soup = BeautifulSoup(respons.read(),"lxml")
print soup.a
#最大页数在span标签中的第１0个
page = soup.find_all('a',class_ = 'page-numbers')
max_page = page[-2].text


same_url = 'http://www.mzitu.com/page/'
for n in range(1,int(max_page)+1):
    ul = same_url+str(n)
    request1 = urllib2.Request(ul,headers=heade)
    start_html = urllib2.urlopen(request1)
    soup = BeautifulSoup(start_html,"lxml")
    #实际上时第一个class = 'postlist'的div里的所有a标签
    all_a = soup.find('div',class_ ='postlist').find_all('a',target='_blank')
    #提取用户名
    for a in all_a:
        title = a.get_text() #提取文本
        if(title!=''):
            print ("准备爬取:"+title)
            if(os.path.exists(path+title.strip().replace('?',''))):
                print("目录已存在")
                flag=1
            else:
                os.makedirs(path+title.strip().replace('?',''))
                flag=0
            os.chdir(path + title.strip().replace('?',''))
            href = a['href']
            request2 = urllib2.Request(href,headers=heade)
            html = urllib2.urlopen(request2)
            mess = BeautifulSoup(html,"lxml")
            pic_max = mess.find_all('span')
            if len(pic_max):
                pic_max = pic_max[10].text #最大页数
            else:
                pic_max = '10'
            if(flag == 1 and len(os.listdir(path+title.strip().replace('?',''))) >= int(pic_max)):
                print('已经保存完毕，跳过')
                continue
            for num in range(1,int(pic_max)+1):
                pic = href+'/'+str(num)
                request = urllib2.Request(pic,headers=heade)
                html = urllib2.urlopen(request)
                mess = BeautifulSoup(html,"lxml")
                pic_url = mess.find('img',alt = title)
                if pic_url is None:
                    continue
                else:
                    request = urllib2.Request(pic_url['src'],headers=heade)

                html = urllib2.urlopen(request)
                file_name = pic_url['src'].split(r'/')[-1]
                f = open(file_name,'wb')
                f.write(html.read())
                f.close()
            print "完成"
    print "第",n,"页完成"

第一次写不太完善，有问题评论

下一节：另一种方式爬取妹子图

表情包获取