点击蓝字“python教程”关注咱们哟!
用python实现的抓取腾讯视频全部电影的爬虫html
##完整代码python
# -*- coding: utf-
8
-*-
微信
import
re
工具
import
urllib2
post
from bs4
import
BeautifulSoup
学习
import
string, time
开发工具
import
pymongo
flex
NUM =
0
#全局变量,电影数量
网站
m_type = u
''
#全局变量,电影类型
url
m_site = u
'qq'
#全局变量,电影网站
#根据指定的URL获取网页内容
def gethtml(url):
req = urllib2.Request(url)
response = urllib2.urlopen(req)
html = response.read()
return
html
#从电影分类列表页面获取电影分类
def gettags(html):
global m_type
soup = BeautifulSoup(html) #过滤出分类内容
#print soup
#<ul
class
=
"clearfix _group"
gname=
"mi_type"
gtype=
"1"
>
tags_all = soup.find_all(
'ul'
, {
'class'
:
'clearfix _group'
,
'gname'
:
'mi_type'
})
#print len(tags_all), tags_all
#print str(tags_all[
1
]).replace(
'\n'
,
''
)
#<a _hot=
"tag.sub"
class
=
"_gtag _hotkey"
href=
"http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html"
title=
"动做"
tvalue=
"0"
>动做</a>
re_tags = r
'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
p = re.compile(re_tags, re.DOTALL)
tags = p.findall(str(tags_all[
0
]))
if
tags:
tags_url = {}
#print tags
for
tag
in
tags:
tag_url = tag[
0
].decode(
'utf-8'
)
#print tag_url
m_type = tag[
1
].decode(
'utf-8'
)
tags_url[m_type] = tag_url
else
:
print
"Not Find"
return
tags_url
#获取每一个分类的页数
def get_pages(tag_url):
tag_html = gethtml(tag_url)
#div
class
="paginator
soup = BeautifulSoup(tag_html) #过滤出标记页面的html
#print soup
#<div
class
=
"mod_pagenav"
id=
"pager"
>
div_page = soup.find_all(
'div'
, {
'class'
:
'mod_pagenav'
,
'id'
:
'pager'
})
#print div_page #len(div_page), div_page[
0
]
#<a
class
=
"c_txt6"
href=
"http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html"
title=
"25"
><span>
25
</span></a>
re_pages = r
'<a class=.+?><span>(.+?)</span></a>'
p = re.compile(re_pages, re.DOTALL)
pages = p.findall(str(div_page[
0
]))
#print pages
if
len(pages) >
1
:
return
pages[-
2
]
else
:
return
1
def getmovielist(html):
soup = BeautifulSoup(html)
#<ul
class
=
"mod_list_pic_130"
>
divs = soup.find_all(
'ul'
, {
'class'
:
'mod_list_pic_130'
})
#print divs
for
div_html
in
divs:
div_html = str(div_html).replace(
'\n'
,
''
)
#print div_html
getmovie(div_html)
def getmovie(html):
global NUM
global m_type
global m_site
re_movie = r
'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
p = re.compile(re_movie, re.DOTALL)
movies = p.findall(html)
if
movies:
conn = pymongo.Connection(
'localhost'
,
27017
)
movie_db = conn.dianying
playlinks = movie_db.playlinks
#print movies
for
movie
in
movies:
#print movie
NUM +=
1
print
"%s : %d"
% (
"="
*
70
, NUM)
values = dict(
movie_title = movie[
1
],
movie_url = movie[
0
],
movie_site = m_site,
movie_type = m_type
)
print values
playlinks.insert(values)
print
"_"
*
70
NUM +=
1
print
"%s : %d"
% (
"="
*
70
, NUM)
#
else
:
# print
"Not Find"
def getmovieinfo(url):
html = gethtml(url)
soup = BeautifulSoup(html)
#pack pack_album album_cover
divs = soup.find_all(
'div'
, {
'class'
:
'pack pack_album album_cover'
})
#print divs[
0
]
#<a href=
"http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html"
target=
"new"
title=
"《血滴子》独家纪录片"
wl=
"1"
> </a>
re_info = r
'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
p_info = re.compile(re_info, re.DOTALL)
m_info = p_info.findall(str(divs[
0
]))
if
m_info:
return
m_info
else
:
print
"Not find movie info"
return
m_info
def insertdb(movieinfo):
global conn
movie_db = conn.dianying_at
movies = movie_db.movies
movies.insert(movieinfo)
if
__name__ ==
"__main__"
:
global conn
tags_url =
"http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
#print tags_url
tags_html = gethtml(tags_url)
#print tags_html
tag_urls = gettags(tags_html)
#print tag_urls
for
url
in
tag_urls.items():
print str(url[
1
]).encode(
'utf-8'
) #,url[
0
]
maxpage =
int
(get_pages(str(url[
1
]).encode(
'utf-8'
)))
print maxpage
for
x
in
range(
0
, maxpage):
#http:
//v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
m_url = str(url[
1
]).replace(
'0_20_0_-1_0.html'
,
''
)
movie_url =
"%s%d_20_0_-1_0.html"
% (m_url, x)
print movie_url
movie_html = gethtml(movie_url.encode(
'utf-8'
))
#print movie_html
getmovielist(movie_html)
time.sleep(
0.1
)
注意事项
对Python开发技术感兴趣的同窗,欢迎加下方的交流群一块儿学习,相互讨论。
学习python过程当中有不懂的能够加入个人python零基础系统学习交流秋秋qun:934109170,与你分享Python企业当下人才需求及怎么从零基础学习Python,和学习什么内容。相关学习视频资料、开发工具都有分享
好啦!文章就给看官们分享到这儿
最后,若是以为有帮助,记得关注、转发、收藏哟
本文分享自微信公众号 - python教程(pythonjc)。
若有侵权,请联系 support@oschina.cn 删除。
本文参与“OSC源创计划”,欢迎正在阅读的你也加入,一块儿分享。