urllib2抓取HTML存入Excel

时间 2019-11-20

标签 urllib2 urllib 抓取 html 存入 excel 栏目 HTML 繁體版

原文原文链接

通过urllib2抓取HTML网页，然后过滤出包含特定字符的行，并写入Excel文件：

# -*- coding: utf-8 -*-

import sys
#import urllib
import urllib2

from xlwt import Workbook

def getdata(keywords, line):
    """Extract the text after the first '>' of a line that contains a keyword.

    Args:
        keywords: Substring that must occur in ``line`` for it to be used.
        line: One line of HTML text.

    Returns:
        ``False`` when ``keywords`` does not occur in ``line``. Otherwise the
        text between the first ``'>'`` and the next ``'</'``. When the line
        has no ``'>'`` the text starts at the beginning of the line; when no
        ``'</'`` follows, it runs to the end of the line with the trailing
        line terminator removed.
    """
    if keywords not in line:
        return False

    # str.find returns -1 when nothing is found, so ``start`` becomes 0 (the
    # beginning of the line) when the line has no '>'.
    start = line.find('>') + 1
    end = line.find('</', start)
    if end == -1:
        # No closing tag on this line. Slicing with -1 would silently drop the
        # last character, so take the rest of the line and strip only the
        # line terminator.
        return line[start:].rstrip('\r\n')
    return line[start:end]

def FetchDataByUrllib(checkUrl):
    """Download a page and save the lines mentioning the keyword to an .xls file.

    The page at ``checkUrl`` is read line by line. For every line that contains
    the keyword u'体育', the text extracted by ``getdata`` is written to the
    sheet 'mySheet', in column index 1 starting at row index 1. The workbook is
    saved as 'simple.xls' in the current directory.

    Args:
        checkUrl: URL of the page to download.

    Returns:
        None. If the URL cannot be opened, the error is printed and no file is
        written.
    """
    book = Workbook(encoding='gbk')
    # add_sheet creates a new sheet. Cells cannot be overwritten by default,
    # so overwriting has to be enabled explicitly.
    sheet = book.add_sheet('mySheet', cell_overwrite_ok=True)

    try:
        checkFile = urllib2.urlopen(checkUrl)
    except Exception as e:
        # Best effort: report the failure and give up.
        print(e)
        return

    row = 1
    try:
        for raw_line in checkFile:
            # Decode according to the page's encoding (UTF-8 here; a GBK page
            # would need "GBK"). The text is kept as unicode from here on:
            # re-encoding it to the filesystem encoding made the keyword
            # comparison and the workbook's byte decoding depend on the
            # platform.
            line = raw_line.decode("UTF-8")

            # Look for the wanted data and write it to the Excel sheet.
            targetStr = getdata(u'体育', line)  # lines containing the keyword
            if targetStr is not False:
                sheet.write(row, 1, targetStr)
                row += 1
    finally:
        # Release the connection even if decoding or writing fails.
        checkFile.close()

    book.save('simple.xls')
    print('finish!')

# Script entry: announce the start, then scrape the Sina home page.
print('开始...')

myUrl = 'http://www.sina.com.cn'
FetchDataByUrllib(myUrl)

输出结果：