python之网络爬虫

时间 2019-11-09

标签 python 网络爬虫栏目 Python 繁體版

原文原文链接

1、演绎自己的北爱

踏上北漂的航班，开始演奏我自己的北京爱情故事。

2、爬虫1

一、网络爬虫的思路

首先：指定一个url，然后打开这个url地址，读取其中的内容。

其次：从读取的内容中过滤关键字；这一步是关键，关键字可以通过查看网页源代码的方式获取。

最后：下载获取到的html的url地址，或者图片的url地址，并保存到本地。

二、针对指定的url进行网络爬虫

分析：

第一步：大约共有4300个下一页。

第二步：一个页面上有10个个人头像

第三步：一个头像内大约有100张左右的个人图片

指定的淘宝mm的url为：http://mm.taobao.com/json/request_top_list.htm?type=0&page=1

这个页面默认是没有下一页按钮的，我们可以通过修改其url地址中的page参数来查看下一个页面。

最后一页的url地址和页面展示如下图所示：

点击任意一个头像即可进入个人主页，如下图：

三、定制的脚本

 
        #!/usr/bin/env python
        # coding: utf-8
        # Author: Allentuns
        # Email: zhengyansheng@hytyi.com
        """Crawl the Taobao MM list pages and save the photos of each profile.

        Targets Python 2: it relies on urllib.urlopen() / urllib.urlretrieve().
        Every print() call receives one pre-formatted string, so the output is the
        same as with the Python 2 print statement.
        """

        import urllib
        import os
        import sys
        import time

        # Markers located with str.find() in the downloaded HTML.
        ahref = '<a href="'
        ahrefs = '<a href="h'
        ahtml = ".htm"
        atitle = "<img style"
        ajpg = ".jpg"
        btitle = '<img src="'

        LIST_URL = "http://mm.taobao.com/json/request_top_list.htm?type=0&page=%d"
        SAVE_DIR = "/www/src/temp/image/taobaomm"
        # Adjustable; the maximum is 4300 (the author used 3 while testing).
        MAX_PAGE = 4300
        # Number of profile links read from one list page.
        LINKS_PER_PAGE = 18
        # Start value of the image counter on a profile page, kept from the
        # original script. NOTE(review): presumably meant to skip profiles with
        # fewer than 20 images -- confirm.
        FIRST_IMAGE_INDEX = 20


        def image_src(tag):
            """Return the text that follows ``src="`` in *tag*."""
            # +5 skips the five characters of 'src="'.
            return tag[tag.find("src") + 5:]


        def download_profile(url, page):
            """Save the images referenced by the profile page at *url*.

            *page* is the list-page number; it becomes the file-name prefix.

            Returns False when an IOError interrupted the work (the caller then
            abandons the current list page), True otherwise.
            """
            try:
                # Fetch the profile once. The original script downloaded the same
                # page again for every image, and did the first fetch outside the
                # try block, so a single bad link aborted the whole crawl.
                content01 = urllib.urlopen(url).read()

                # The first image is only printed, not saved.
                imgtitle = content01.find(atitle)
                imgjpg = content01.find(ajpg, imgtitle)
                print(image_src(content01[imgtitle:imgjpg + len(ajpg)]))

                imgcount = content01.count(atitle)
                i = FIRST_IMAGE_INDEX
                while i < imgcount:
                    # Resume the search after the end of the previous image.
                    imgtitle = content01.find(atitle, imgjpg)
                    imgjpg = content01.find(ajpg, imgtitle)
                    tureimgurl = image_src(content01[imgtitle:imgjpg + len(ajpg)])
                    print("%s %s" % (i, tureimgurl))
                    time.sleep(1)
                    # A "<" presumably means the slice ran into other markup
                    # instead of being a clean URL; such entries are skipped.
                    if tureimgurl.count("<") == 0:
                        # File name = the URL from its first "T" onwards.
                        imgname = tureimgurl[tureimgurl.index("T"):]
                        urllib.urlretrieve(
                            tureimgurl, "%s/%s-%s" % (SAVE_DIR, page, imgname))
                    i += 1
            except IOError:
                # Was '/n...': a typo for the newline escape.
                print('\nWhy did you do an EOF on me?')
                return False
            except Exception as exc:
                # Was a bare ``except:``; report what actually went wrong.
                print('\nSome error/exception occurred: %s' % exc)
            return True


        def main():
            """Walk the list pages and download every linked profile."""
            if not os.path.isdir(SAVE_DIR):
                os.makedirs(SAVE_DIR)

            for page in range(MAX_PAGE):
                content = urllib.urlopen(LIST_URL % page).read()

                # First link and first thumbnail of the list page.
                href = content.find(ahref)
                html = content.find(ahtml)
                print(content[href + len(ahref):html + len(ahtml)])
                imgtitle = content.find(btitle, html)
                imgjpg = content.find(ajpg, imgtitle)
                littleimgurl = content[imgtitle + len(btitle):imgjpg + len(ajpg)]
                print(littleimgurl)
                # NOTE: fixed file name, so every list page overwrites this file.
                urllib.urlretrieve(littleimgurl, SAVE_DIR + "/allentuns.jpg")

                s = 0
                while s < LINKS_PER_PAGE:
                    href = content.find(ahrefs, html)
                    html = content.find(ahtml, href)
                    if href == -1 or html == -1:
                        # No further link on this page: stop instead of slicing
                        # out a garbage URL.
                        break
                    # len(ahref) (not len(ahrefs)) keeps the leading "h" of the URL.
                    url = content[href + len(ahref):html + len(ahtml)]
                    print("%s %s" % (s, url))

                    imgtitle = content.find(btitle, html)
                    imgjpg = content.find(ajpg, imgtitle)
                    print("%s %s" % (
                        s, image_src(content[imgtitle:imgjpg + len(ajpg)])))

                    # Links containing "photo" are not followed.
                    if url.find("photo") == -1 and not download_profile(url, page):
                        break
                    s += 1
                else:
                    # Reached only when the link loop finished without a break.
                    print("---------------{< 20;1 page hava 10 htm and pic  }-------------------------}")

                print("****************%s page*******************************" % (page + 1))

            print("Download Finshed.")


        if __name__ == "__main__":
            main()

四、图片展现(部分图片)

五、查看下载的图片数量

3、爬虫2

一、首先来分析url

第一步：总共有7个页面；

第二步：每一个页面有20篇文章

第三步:查看后总共有317篇文章

二、python脚本

脚本的功能：通过给定的url，将这个博客里面的全部文章下载到本地

 
        #!/usr/bin/env python
        # coding: utf-8
        """Download every article linked from a Sina blog's list pages.

        Targets Python 2: it relies on urllib.urlopen(). Every print() call
        receives one pre-formatted value, so the output format matches the
        Python 2 print statement.
        """

        import os
        import urllib
        import time

        LIST_URL = "http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html"
        SAVE_DIR = "/tmp/hanhan/"
        # The article list has pages 1..7.
        LAST_PAGE = 7
        # Upper bound on the number of links collected over all pages.
        MAX_LINKS = 350
        # The last 26 characters of an article URL are used as the local file
        # name. NOTE(review): presumably the "blog_<id>.html" part -- confirm.
        NAME_LENGTH = 26


        def extract_links(content, limit=None):
            """Return the article URLs found in the list page *content*, in order.

            For each ``<a title`` marker, the URL is the text between the following
            ``href="`` and the next ``.html`` (inclusive). Scanning stops at the
            first marker that cannot be found, so no empty entries are produced
            (the original appended '' in that case and removed it afterwards).

            *limit*, when given, caps the number of links returned.
            """
            links = []
            pos = 0
            while limit is None or len(links) < limit:
                title = content.find("<a title", pos)
                if title == -1:
                    break
                href = content.find("href=", title)
                if href == -1:
                    break
                html = content.find(".html", href)
                if html == -1:
                    break
                # +6 skips 'href="'; +5 keeps the ".html" suffix.
                links.append(content[href + 6:html + 5])
                pos = html
            return links


        def main():
            """Collect all article links, then save each article under SAVE_DIR."""
            list00 = []
            for page in range(1, LAST_PAGE + 1):
                content = urllib.urlopen(LIST_URL % page).read()
                for url in extract_links(content, MAX_LINKS - len(list00)):
                    print("%s %s" % (len(list00), url))
                    list00.append(url)
                print("Link address Finshed.")
                print("This is %s page" % (page))

            # Debug output kept from the original; guarded because indexing
            # list00[50] raised IndexError when fewer than 51 links were found.
            if len(list00) > 50:
                print("spage= %s" % list00[50])
            print(list00[:51])
            print("All links address Finshed.")

            if not os.path.isdir(SAVE_DIR):
                os.makedirs(SAVE_DIR)

            for j, url in enumerate(list00):
                content = urllib.urlopen(url).read()
                # "wb" instead of the original 'a+': re-running the script must
                # not append a second copy of the article to the same file. The
                # with-block closes the handle, which the original never did.
                with open(SAVE_DIR + url[-NAME_LENGTH:], "wb") as out:
                    out.write(content)
                print("%2s is finshed." % (j))
                # Add time.sleep(1) here to throttle the requests if needed.

            print("Write to file End.")


        if __name__ == "__main__":
            main()

三、下载文章后的截图

四、从linux下载到windows本地，然后打开查看；如下截图