豆瓣相信很多人都爬过,我也把我的方法拿出来交流学习。我也是菜鸟过来的,不会省略代码。此教程纯属娱乐,大神勿喷。
# encoding:UTF-8 import re import requests import MySQLdb from bs4 import BeautifulSoup headers = {'User-Agent' :'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'} def getPage(get_url): r=requests.get(get_url) response = r.text return response def filterpage(): pageCode = getPage(get_url) pattern = re.compile('<em class="">(.*?)</em>.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>',re.S) items = re.findall(pattern,pageCode) pageStories = [] for item in items: pageStories.append([item[0].strip(),item[1].strip(),item[2].strip()]) return pageStories def save_data(): Dbdata=filterpage() db=MySQLdb.connect(host='localhost',user='root',passwd='******',db='movie',charset='utf8') cursor=db.cursor() for a in Dbdata: id=a[0] name=a[1] grade=a[2] #comment=a[3] value=[id,name,grade] cursor.execute('insert into movie_info values(%s,%s,%s)',value) db.commit() def main(): pageStories=filterpage() #print pageStories for story in pageStories: try: print u"序号:",story[0],u"电影名:",story[1], u"\t评分:", story[2] except: pass if __name__ == '__main__': get_url = 'https://movie.douban.com/top250/' i=1 while i < 11: main() save_data() print u'页码',i num = i * 25 get_url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter=' i = i + 1
[mysqld]
basedir=F:\mysql-5.7.18-winx64        # 安装目录(注意:安装目录应配置为 basedir,而不是 tmpdir)
datadir=F:\mysql-5.7.18-winx64\data   # data目录,没有data目录请手动建立
early-plugin-load=""
skip-grant-tables                     # 注意:跳过权限验证仅供首次配置调试使用,配置完成后请删除此行
cd F:\mysql-5.7.18-winx64\bin # 进入bin目录,否则有可能会报错 mysqld --initialize # 初始化命令(MySQL 5.7 需要 --initialize 参数) net start mysql # 启动服务
mysql>create database movie; # 建立movie数据库 mysql>use movie; # 切换到movie数据库 mysql>create table movie_info (id varchar(100),name varchar(100),grade decimal(3,1));
这个教程中,我们需要爬取豆瓣top250的电影名、序号和相应的评分。按F12分析网页结构,找到包含《肖申克的救赎》的代码处:
<em class="">1</em> # 序号 . . . <span class="title">肖申克的救赎</span> # 电影名 . . . <span class="rating_num" property="v:average">9.6</span> # 评分
ok,往后翻,可以看到剩下的都是这类结构。我们找到规律了,就可以想办法把里面的东西提取出来。
一般常用的有re模块和beautifulsoup模块。我是用re模块正则匹配,感觉正则更强大一点;BeautifulSoup模块用起来简单,可是速度慢。
def getPage(get_url):
    """Download *get_url* and return the response body as text.

    Fix: send the module-level ``headers`` dict — the script defines a
    browser-like User-Agent, but the original never passed it, and Douban
    may reject UA-less clients.
    """
    r = requests.get(get_url, headers=headers)
    response = r.text  # decoded page HTML (encoding guessed by requests)
    return response
def parse_movies(pageCode):
    """Extract ``[rank, title, rating]`` rows from a Top250 HTML page.

    Pure function (no network) so the parsing logic is testable on its own.
    Non-greedy groups grab the values; greedy ``.*?`` with re.S skips the
    markup between them across newlines.
    """
    pattern = re.compile('<em class="">(.*?)</em>.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>', re.S)
    return [[rank.strip(), title.strip(), rating.strip()]
            for rank, title, rating in pattern.findall(pageCode)]


def filterpage():
    """Fetch the current page (module-level ``get_url``) and parse it."""
    return parse_movies(getPage(get_url))
我们用第一个(.*?)非贪婪匹配来匹配序号,接着用.*?贪婪匹配来匹配不需要的代码,往后同样用非贪婪匹配来匹配我们需要的信息。
def save_data():
    """Scrape the current page and insert its rows into movie.movie_info."""
    Dbdata = filterpage()
    # NOTE(review): hard-coded root password — move credentials to a config
    # file or environment variable before publishing this code.
    db = MySQLdb.connect(host='localhost', user='root', passwd='suyu.123',
                         db='movie', charset='utf8')
    try:
        cursor = db.cursor()
        # Parameterized executemany: one round trip for the whole page
        # instead of one INSERT per row.
        cursor.executemany('insert into movie_info values(%s,%s,%s)', Dbdata)
        db.commit()  # don't forget to commit the transaction
    finally:
        db.close()  # fix: the original never closed the connection
def main(): pageStories=filterpage() # 传参 for story in pageStories: # 迭代打印 try: print u"序号:",story[0],u"电影名:",story[1], u"\t评分:", story[2] except: pass
if __name__ == '__main__': get_url = 'https://movie.douban.com/top250/' # 起始网页 i=1 while i < 11: main() # 运行主函数 save_data() # 运行存储函数 print u'页码',i # num = i * 25 get_url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter=' i = i + 1