Python 3.6.3
myConfig.py
'''
Crawler configuration values shared by index / myList / myPage / myData.
'''

# Root URL of the site being crawled
homePageUrl = 'http://bbs.fengniao.com'

# Local folder where downloaded images are stored
imgFolder = 'C:/L/workspace/FirstPython/src/1809 - PetPhoto/img/'

# Forum (board) number on the site
forumNum = '30'

# First and last list-page numbers to crawl (inclusive)
pageBegin = 1
pageEnd = 2
index.py
'''
Crawl pet photos from the "Pet Photography" board of bbs.fengniao.com.

For local testing, do not use the browser's "Save page as"; instead open the
page, right-click -> view source, then copy & paste the markup into a file.
'''
import myConfig
import myList

if __name__ == '__main__':

    # Walk every configured list page
    for pageNum in range( myConfig.pageBegin, myConfig.pageEnd + 1 ):

        # The first page uses a different URL shape than the later ones:
        #   http://bbs.fengniao.com/forum/forum_30.html
        #   http://bbs.fengniao.com/forum/forum_30_2_lastpost.html
        suffix = '' if pageNum == 1 else '_%s_lastpost' % str( pageNum )

        url = 'http://bbs.fengniao.com/forum/forum_%s%s.html' % ( myConfig.forumNum, suffix )

        # Handle one list page
        myList.eachList( url )
myList.py
'''
"My List" module.
- - - - - - - - - -
- A list page is an index page that aggregates data pages, e.g. an article or
  post listing.
- This module collects the full set of post-page URLs from each list page, so
  the next processing stage is myPage; the previous stage is index (the entry
  script, which does the setup work).
- listPageHtml() exists as a separate node so local HTML files can be used for
  debugging.
- 2018-0928 Livon
'''
import re
import os
import urllib.request
import myConfig
import myPage


def eachList( listPageUrl ):
    '''
    Fetch one forum list page and hand its HTML to the parser.

    listPageUrl: absolute URL of one list page.
    '''
    print( listPageUrl )

    # Fetch one list page (default urllib behavior: no timeout, no User-Agent)
    htmlResponse = urllib.request.urlopen( listPageUrl )
    html = htmlResponse.read()
    html = html.decode('utf8')

    # Parse the HTML
    listPageHtml( html )


def listPageHtml( html ):
    '''
    Parse the HTML of one list page and process every post entry found.
    Call this directly when debugging against a local HTML file.
    '''
    # All <ul class="txtList"> regions -- the post entries live inside them
    arr_post_list_ul = re.findall("<ul class=\"txtList\">((?:.|\n)*?)</ul>", html )

    if( len( arr_post_list_ul ) < 1 ):
        # No list region: the site markup probably changed -- give up on this page
        print('未发现列表区域,程序终止。')
        return

    # Print every region -- debugging aid only
    for i in range( 0, len( arr_post_list_ul )):
        print('arr_post_list_ul : ' + arr_post_list_ul[i] )

    print('该页列表区域数量:')
    print( len( arr_post_list_ul ) )

    # Page 1 carries 2 regions, later pages only one; in both cases the LAST
    # region is the wanted post list.
    ul = arr_post_list_ul[ len(arr_post_list_ul) - 1 ]

    # Individual post entries (note the space in "<li >" -- matches the site's markup)
    arr_post_list_li = re.findall("<li >((?:.|\n)*?)</li>", ul )

    # Process every post entry
    for i in range( 0, len( arr_post_list_li ) ):
        post_list_li( i, arr_post_list_li[i] )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Process one post entry
Parameters:
    i:  zero-based index within the page; li: inner HTML of the post's <li> element
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def post_list_li( i, li ):
    '''
    Process one post: collect the URLs of all of its pages, create the image
    folder, then delegate downloading to myPage.postPageList.
    '''
    # URLs of every page of this post
    postPageList = []

    # Index (printed 1-based)
    print( str( i + 1 ) + ' - arr_post_list_li : ' )
    print( li )

    # Post time -- convertTime() below depends on the exact
    # 'YYYY-MM-DD hh:mm:ss' layout of this string
    time = re.findall('<span class="time">(.*?)</span>', li )
    print( ' - 时间 : ' + time[0] )

    # Author -- groups are (profile href, title attribute, link text)
    author = re.findall('<a class="username" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a>', li )
    print( ' - 做者 : ' + author[0][1] )
    print( ' - 主页 : ' + author[0][0] )

    # Title link -- groups are (relative href, title attribute, class suffix, link text)
    title = re.findall('<a href="(.*?)" target="_blank" title="(.*?)" class="tit(.*?)" style=\'\'>(.*?)</a>', li )
    print( ' - 帖子Url : ' + myConfig.homePageUrl + title[0][0] )
    print( ' - 帖子Title : ' + title[0][1] )

    # Add the main post URL
    postPageList.append( myConfig.homePageUrl + title[0][0] )

    # Sub-page links: a post with several pages lists them between "( ... )" spans
    sub_post = re.findall('<span>\(</span>(.*?)<span>\)</span>', li )

    # Is there more than one sub-page?
    if( len(sub_post) > 0 ):

        # Yes -- the first sub-page duplicates the main post page, so rebuild
        postPageList.clear()  # clear the list and re-append (1st time)
        print( ' - 子页面 Url(第一个子页面与帖子页面内容相同): ' )

        # Every sub-page link -- groups are (relative href, link text)
        arr_url = re.findall('<a href="(.*?)" target="_blank">(.*?)</a>', sub_post[0] )
        for url in arr_url:
            print( url[1] + ' - ' + myConfig.homePageUrl + url[0] )

            # Append each sub-page URL
            postPageList.append( myConfig.homePageUrl + url[0] )

            # Is there a "last page" link? ('最后一页' = "last page")
            if( url[1] == '最后一页' ):

                # Yes -- everything collected so far is discarded and every
                # page URL is regenerated from the page count
                postPageList.clear()  # clear the list and re-append (2nd time)

                # Total page count, sliced out of the href; assumes a fixed
                # 16-char prefix and a 5-char '.html' suffix in the URL --
                # TODO confirm against the live markup
                pageCount = int( url[0][16:-5])
                print( ' 共有页数: ' + str(pageCount) )
                # NOTE(review): page numbers are generated as 0..pageCount-1
                # here; confirm the site really numbers sub-pages from 0.
                for pageNum in range( 0, pageCount ):
                    print( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )
                    # Append each regenerated sub-page URL
                    postPageList.append( myConfig.homePageUrl + url[0][:16] + str( pageNum ) + url[0][-5:] )

    # Thumbnails shown on the list page (first 5 only) -- kept for reference, unused
    # regularExpress = 'style="background-image\:url\((.*?)\?imageView2/2/w/400/q/90/ignore-error/1/\)"></a>'
    # arr_pic = re.findall( regularExpress, li )
    # print( ' - 列表页中的贴子图片(取前5张): ' )
    # for pic in arr_pic:
    #     print( ' - ' + pic )

    # Folder that stores this post's images,
    # e.g. 2018-0924-0642《萌萌哒土拨鼠》-美时美摄
    folderName = '%s《%s》- %s' % ( convertTime( time[0] ), title[0][1], author[0][1] )

    path = myConfig.imgFolder + folderName
    print( '存储路径 ====> ' + path )

    # An existing folder means the post was crawled before; to re-crawl it,
    # delete the folder by hand
    if( os.path.exists( path ) ):
        # Folder already exists -- skip creating it (download still proceeds below)
        print( 'ERROR : 目录已经存在,应该是以前曾经获取过,若是想从新抓取,请手动删除该目录。' )
    else:
        # Folder does not exist -- create it (including parents)
        os.makedirs( path )

    # Process every page of this post, i.e. fill the folder with its images
    myPage.postPageList( folderName, postPageList )


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Convert a timestamp
from: 2018-09-29 10:15:25
to:   2018-0929-1015
Purpose: produce a legal directory name
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
def convertTime( t ):
    '''
    Pure positional slicing -- only correct for exactly the
    'YYYY-MM-DD hh:mm:ss' layout shown above.
    '''
    t = '%s-%s-%s' % ( t[:4], t[4:10].replace('-',''), t[11:16].replace( ':', '' ))
    return t


''' = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# main -- quick manual test of convertTime
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - '''
if __name__ == '__main__':
    print( convertTime('2018-09-29 10:15:25'))
myPage.py
'''
"My Page" module.
- - - - - - - - - -
- A page holds the actual data: here, one page of a forum post (one post can
  span several pages).
- Downstream stage: myData (downloads each image); upstream stage: myList.
- postPageList() receives the URLs of all pages of one post and walks them.
- postPageHtml() is the local-file debugging node: feed it saved HTML directly.
- 2018-0928 Livon
'''

import re
import urllib.request
import myConfig
import myData


def postPageList( folderName, postPageList ):
    '''
    Fetch every page of one post and extract its images.

    folderName:   target folder name (relative to myConfig.imgFolder).
    postPageList: URLs of all pages of the post (a post has at least one page).
    '''
    for pageNum, pageUrl in enumerate( postPageList ):
        # request one page of the post -- it may be the only one, or a sub-page
        response = urllib.request.urlopen( pageUrl )
        pageHtml = response.read().decode('utf8')

        # hand the fetched HTML to the parser
        postPageHtml( folderName, pageNum, pageHtml )


def postPageHtml( folderName, pageNum, html ):
    '''
    Extract every image URL from one page's HTML and download each image.
    Call this directly when testing against a local HTML file.
    '''
    # all image URLs on the page (the site appends an imageView2 query string)
    pattern = '<img src="(.*?)\?imageView2/2/w/1024/q/90/ignore-error/1/">'
    picUrls = re.findall( pattern, html )

    # handle each picture in turn
    for i, picUrl in enumerate( picUrls ):
        print( '序号:%s - 图片地址:%s' % ( str(i+1), picUrl ))
        eachImg( folderName, pageNum, i, picUrl )


def eachImg( folderName, pageNum, i, imgUrl ):
    '''
    Download one image twice: the original and the site-generated thumbnail.
    Parameters: folder name, page number within the post, image index on that
    page, image URL.
    '''
    # full-size original
    file = '%s%s/%s-%s.jpg' % ( myConfig.imgFolder, folderName, str( pageNum), str(i))
    myData.crawl( file, imgUrl )

    # thumbnail -- the site serves it when the imageView2 query string is appended
    file = '%s%s/%s-%s_small.jpg' % ( myConfig.imgFolder, folderName, str( pageNum), str(i))
    myData.crawl( file, imgUrl + '?imageView2/2/w/1024/q/90/ignore-error/1/' )
myData.py
'''
myData -- the data (download) layer of the crawler.
'''
import urllib.request
import urllib.error  # explicit import: do not rely on urllib.request importing it as a side effect


'''
Crawl one piece of data
- - - - - - - - - - - - - - - -
Parameters: file (path + file name), image URL
'''
def crawl( file, url ):
    '''
    Download *url* and write the bytes to *file*.

    file: destination path; its directory must already exist (myList creates it).
    url:  image URL.

    An HTTP error is reported and swallowed on purpose: one broken image must
    not abort the whole crawl.
    '''
    try:
        response = urllib.request.urlopen( url )
    except urllib.error.HTTPError as e:
        # deliberate best-effort: log the reason and keep going
        print( e.reason )
    else:
        # 'with' guarantees both the response and the file are closed,
        # even if the write fails ('wb': the payload is binary)
        with response, open( file, 'wb' ) as fp:
            fp.write( response.read() )
        print( "爬取数据成功" )