#!/usr/bin/python3
# coding=utf8
import requests
from bs4 import BeautifulSoup
import pymysql
import time

'''
Purpose: a video site I use has no search feature, so this Python crawler scrapes
the video titles and magnet links from the site and stores them all in a MySQL
database; I can then search the database by keyword to get a film's download
address.
Author: xiaoxiaohui
Date: 2019-10-03

Note 1: create the database and table in MySQL first:
    mysql -uroot -pxxh123
    create database 4hucom;
    use 4hucom;
    -- id auto-increments
    CREATE TABLE `4hu_shoujixiaoshipin` (
        `id` INT(11) NOT NULL AUTO_INCREMENT,
        `biaoti` VARCHAR(380),
        `fabutime` VARCHAR(380),
        `lianjie` VARCHAR(380),
        PRIMARY KEY (id)
    );

Note 2: this was adapted quickly from some earlier crawler code, so (1) the
function name get_house_info is carried over from a rental-listing crawler, and
(2) the info key '播放地址' holds fabutime; a name like bofangdizhi would be
clearer.
'''

def get_links(url):
    # Collect the detail-page links from one listing page.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('li', class_="col-md-2 col-sm-3 col-xs-4")
    links = ['http://www.网站名马赛克.com' + div.a.get('href') for div in links_div]
    return links

def get_house_info(item_url):
    # Scrape the title, play address and magnet link from one detail page.
    response = requests.get(item_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('ul', class_="playul")
    # Note there are two "playul" lists: links_div[0] is for playing,
    # links_div[1] is for downloading.
    lianjie_temp = 'http://www.网站名马赛克.com' + links_div[1].li.a.get('href')  # download page link
    lianjie = get_cililianjie(lianjie_temp)
    print(lianjie)
    links_div2 = soup.find_all('div', class_="detail-title fn-clear")
    biaoti = links_div2[0].text.strip()  # film title; .strip() removes surrounding whitespace
    fabutime = 'http://www.网站名马赛克.com' + links_div[0].li.a.get('href')  # play address
    info = {
        '影片名字': biaoti,
        '播放地址': fabutime,
        '下载连接': lianjie
    }
    return info

def get_cililianjie(url):
    # Follow the download page and extract the magnet link.
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('div', class_="download")
    lianjie = links_div[0].a.get('href')  # magnet link
    return lianjie

def get_db(setting):
    return pymysql.connect(**setting)

def insert(db, house):
    # Parameterized query, so quotes in a title cannot break the SQL.
    sql = 'insert into 4hu_shoujixiaoshipin (biaoti,fabutime,lianjie) values(%s,%s,%s)'
    cursor = db.cursor()
    cursor.execute(sql, (house['影片名字'], house['播放地址'], house['下载连接']))
    db.commit()

DATABASE = {
    'host': '127.0.0.1',
    'database': '4hucom',
    'user': 'root',
    'password': 'xxh123',
    'charset': 'utf8'  # was utf8mb4, but Navicat then showed mojibake; with utf8 the Chinese displays correctly
}

db = get_db(DATABASE)  # connect to the database

# Loop over every listing page (pages 1 through 43).
for yema in range(1, 44):
    if yema == 1:
        url = 'https://www.网站名马赛克.com/vod/html7/index.html'
    else:
        url = 'https://www.网站名马赛克.com/vod/html7/index_' + str(yema) + '.html'
    links = get_links(url)
    for item_url in links:
        time.sleep(1.0)  # be polite: at most one request per second
        house = get_house_info(item_url)
        print('Fetched one record: {}'.format(house['影片名字']))
        insert(db, house)  # insert the scraped record into the database

db.close()
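
# --- Searching the scraped data by keyword ---
# The docstring above says the whole point of the crawl is to search the MySQL
# table by keyword and get a film's download address. A minimal sketch of that
# lookup, reusing the DATABASE settings and table from this script; search_films
# is a hypothetical helper, not part of the original code.
def search_films(db, keyword):
    # LIKE with a parameterized pattern; returns (biaoti, lianjie) rows.
    sql = 'select biaoti, lianjie from 4hu_shoujixiaoshipin where biaoti like %s'
    cursor = db.cursor()
    cursor.execute(sql, ('%' + keyword + '%',))
    return cursor.fetchall()

# Example usage (assumes the crawl above has already populated the table;
# open a fresh connection, since db was closed at the end of the crawl):
# db2 = get_db(DATABASE)
# for biaoti, lianjie in search_films(db2, '关键字'):
#     print(biaoti, lianjie)
# db2.close()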