很多好看的小说只能在线阅读,不能下载。本文教你怎样爬取一个网站上的所有小说。
知识点:
开发环境:
第三方库:
进行网页分析
目标站点:
爬取一章小说
import requests
import parsel
# Crawl a single chapter of a novel and save it as "<chapter title>.txt".

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# requests applies no timeout by default, so set one to avoid hanging forever.
response = requests.get(
    "http://www.shuquge.com/txt/8659/2324752.html", headers=headers, timeout=10
)
# Stop on an HTTP error instead of parsing and saving an error page.
response.raise_for_status()
# Decode the body with the charset detected from its content.
response.encoding = response.apparent_encoding
html = response.text
print(html)

# Pull the chapter title and the text nodes of the chapter body out of the page.
sel = parsel.Selector(html)
title = sel.css(".content h1::text").extract_first()
if title is None:
    # extract_first() returns None when nothing matches; fail with a clear
    # message rather than a TypeError when building the file name below.
    raise ValueError("chapter title not found in the page")
contents = sel.css("#content::text").extract()

# Strip leading/trailing whitespace from every text node.
contents2 = [content.strip() for content in contents]
print(contents)
print(contents2)

chapter_text = "\n".join(contents2)
print(chapter_text)

# Save the chapter body; the file is named after the chapter title.
with open(title + ".txt", mode="w", encoding="utf-8") as f:
    f.write(chapter_text)
爬取一本小说
import requests
import parsel
# Fetching the page source: browser-like User-Agent header sent with the
# HTTP requests made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
def download_one_chapter(target_url):
    """Download one chapter page and save it as ``<chapter title>.txt``.

    The file is written to the current working directory in UTF-8 and
    contains the chapter title on the first line followed by the chapter
    text. The title and intermediate results are also printed to stdout.

    Args:
        target_url: URL of the chapter page to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no chapter title can be found in the page.
    """
    # requests applies no timeout by default, so set one to avoid hanging.
    response = requests.get(target_url, headers=headers, timeout=10)
    response.raise_for_status()
    # Decode the body with the charset detected from its content.
    response.encoding = response.apparent_encoding
    html = response.text

    # Extract the information we need from the page source.
    sel = parsel.Selector(html)
    title = sel.css('.content h1::text').extract_first()
    if title is None:
        # extract_first() returns None when nothing matches; fail with a
        # clear message rather than a TypeError when building the file name.
        raise ValueError('chapter title not found at ' + target_url)
    contents = sel.css('#content::text').extract()
    print(title)
    print(contents)

    # Data cleaning: strip leading/trailing whitespace from every text node.
    contents1 = [content.strip() for content in contents]
    print(contents1)
    text = '\n'.join(contents1)
    print(text)

    # Save the chapter. The context manager closes the file even if a write
    # fails; the newline keeps the title from running into the first line.
    with open(title + '.txt', mode='w', encoding='utf-8') as file:
        file.write(title + '\n')
        file.write(text)
def get_book_links(book_url):
    """Return the chapter links found on a book's index page.

    Args:
        book_url: URL of the book's index (table of contents) page.

    Returns:
        A list with the ``href`` value of every ``<a>`` inside a ``<dd>``
        element, in document order. The values are returned exactly as they
        appear in the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Send the same User-Agent header as the chapter download does, and set a
    # timeout because requests applies none by default.
    response = requests.get(book_url, headers=headers, timeout=10)
    response.raise_for_status()
    # Decode the body with the charset detected from its content.
    response.encoding = response.apparent_encoding
    sel = parsel.Selector(response.text)
    return sel.css('dd a::attr(href)').extract()
def get_one_book(book_url):
    """Download every chapter of the book whose index page is ``book_url``.

    Each chapter URL is printed before it is downloaded.

    Args:
        book_url: URL of the book's index (table of contents) page.
    """
    # Imported here so the block does not depend on the file's import section.
    from urllib.parse import urljoin

    for link in get_book_links(book_url):
        # Resolve the link against the index page URL instead of a base URL
        # hard-coded for one particular book, so any book_url works.
        chapter_url = urljoin(book_url, link)
        print(chapter_url)
        download_one_chapter(chapter_url)
# Author's note (translated from the original article):
# If anything is unclear while you are studying, you can join my
# Python study QQ group, 784758214.
# The group has good learning materials, development tools and e-books.
# It also shares what companies currently look for in Python developers,
# how to learn Python well from scratch, and what to study.
if __name__ == "__main__":
    # Index page of the book to download.
    book_url = "http://www.shuquge.com/txt/8659/index.html"
    get_one_book(book_url)
爬取全站小说