A First Look at Python Web Scraping

Fetching a page with the requests library

import requests

# fetch the CSDN homepage and print the raw HTML
target = 'https://www.csdn.net/'
req = requests.get(target)
print(req.text)

Output (the HTML is truncated here):

</head>
<body data-category="home" data-host_type="www">
    <script id="toolbar-tpl-scriptId" prod="download" skin="black" src="//csdnimg.cn/public/common/toolbar/js/content_toolbar.js" type="text/javascript" domain="http://blog.csdn.net"></script>
    <div class="container clearfix">
      <nav id="nav" class="clearfix">
        <div class="clearfix">
        <div class="nav_com">
          <ul>
                  <li class="active"><a href="/">推荐</a></li>
                      <li class=""><a href="/nav/watchers">关注</a></li>
                      <li class=""><a href="/nav/career">程序人生</a></li>

Scraping the HTML of CSDN blog posts with a crawler

import requests
import re
import time
import numpy as np

Fetch the HTML content of a given URL:

def getHtml(url):
    # masquerade as Baiduspider so CSDN serves the page; give up after 2 s
    res = requests.get(url, timeout=2, headers={'User-Agent': 'Baiduspider'})
    return res.text

Extract the links to the individual posts from a CSDN article-list page:

def getURL(list_html):
    # on a CSDN list page every post link appears as
    #   <h4 class="">
    #       <a href="https://..." target="_blank">
    begin = """<h4 class="">
        <a href=\""""
    end = '" target="_blank">'
    # non-greedy match between the two fixed-width anchors (lookbehind/lookahead)
    r = r'(?<=' + begin + ').*?(?=' + end + ')'
    res = re.findall(r, list_html)
    return res
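As a quick sanity check, feeding getURL a snippet in the list-page format (the article ID below is made up) should return exactly one link:

sample = """<h4 class="">
        <a href="https://jkchen.blog.csdn.net/article/details/123456789" target="_blank">"""
print(getURL(sample))  # ['https://jkchen.blog.csdn.net/article/details/123456789']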

Save the HTML to a file; url here is the local save path:

def saveFile(file, url):
    # despite its name, url is a local file path here
    with open(url, 'w', encoding='UTF-8') as fout:
        fout.write(file)


def loadFile(url):
    with open(url, 'r', encoding='utf-8') as fread:
        return fread.read()
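A quick round trip through the two helpers (the file name is arbitrary):

html = getHtml('https://www.csdn.net/')
saveFile(html, 'test.html')
assert loadFile('test.html') == html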

Extract the title of an HTML page:

def getTitle(html):
    # CSDN titles look like <title>post title_author的博客-CSDN博客</title>;
    # keep everything between <title> and the last underscore
    return re.search(r'(?<=<title>).*(?=_)', html)[0]
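For example, on a hypothetical CSDN title tag:

print(getTitle('<title>python爬虫初探_jkchen的博客-CSDN博客</title>'))  # python爬虫初探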


# all of the blog-post links
blog_urls = []


# collect the blog-post links
def Init():
    # number of list pages on the blog
    page = 36
    for index in range(1, page + 1):
        list_url = 'https://jkchen.blog.csdn.net/article/list/' + str(index)
        list_html = getHtml(list_url)
        blog_url_ar = getURL(list_html)
        for url in blog_url_ar:
            blog_urls.append(url)
    np.save('blog_url.npy', blog_urls)
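NumPy is used here purely as a convenient way to persist the list of URLs; a plain text file, one URL per line, would work just as well (a sketch, with blog_url.txt as an arbitrary file name):

with open('blog_url.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(blog_urls))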


if __name__ == '__main__':
    # whether the article index needs refreshing
    refresh = False
    if refresh:
        Init()

    # whether to save the raw HTML source files (the target folder must already exist)
    toSave = False
    saveUrl = 'HTMLs/'

    blog_urls = np.load('blog_url.npy')

    epoch = 100
    for T in range(epoch):
        np.random.shuffle(blog_urls)
        index = 0
        for url in blog_urls:
            index += 1
            while True:
                try:
                    html = getHtml(url)
                    break
                except requests.exceptions.RequestException:  # typically a timeout when the crawler gets throttled
                    print("Banned, and retry. ")
                    time.sleep(4)
            title = getTitle(html)

            if toSave:
                saveFile(html, saveUrl + title + '.html')
            print('epoch: {}, index: {}, title: {}'.format(T + 1, index, title))
            time.sleep(10 * np.random.rand())  # pause 0-10 s between requests to stay polite
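The fixed 4-second pause between retries works, but a server that throttles aggressively is usually handled better with exponential backoff. A minimal sketch of a drop-in replacement for the inner retry loop (fetch_with_backoff is a name introduced here, not part of the original script):

def fetch_with_backoff(url, max_tries=5):
    delay = 2  # initial pause in seconds
    for attempt in range(max_tries):
        try:
            return getHtml(url)
        except requests.exceptions.RequestException:
            print('Banned, retrying in {} s.'.format(delay))
            time.sleep(delay)
            delay *= 2  # double the pause after every failure
    raise RuntimeError('gave up on ' + url)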