爬取的数据,需要保存,可以存储在文件中或者数据库中。
python 字典操作参考:
http://jianwl.com/2017/08/22/%E9%AB%98%E6%95%88%E5%AE%9E%E7%94%A8Python%E5%AD%97%E5%85%B8%E7%9A%84%E6%B8%85%E5%8D%95/python
python 读写参考:
http://www.javashuo.com/article/p-dyjkekrw-cv.html
all_house数据结构:all_house[{'house_area':dd,'price':dd,'build_year':dd},{},{}...]
f=open('net_saving_data.txt','w'); for item in all_house: # house_area=item['house_area']; # price=item['price']; output='\t'.join([str(item['house_area']),str(item['price']),str(item['build_year'])]); f.write(output); f.write('\n'); f.close();
效果如图:
'\t'.join(["house_area","price","build_year"])
,注意join()内是个列表。CSV(Comma-Separated Values),以逗号分隔值的文件格式,文件以纯文本格式存储表格数据(数字和文本),每一行以换行符分隔,列与列之间用逗号分隔。与txt比较,能够存储的数据大小差不多,可是数据以逗号分隔较整齐,所以python网络爬虫常常用此来存储数据。
import csv; f=open('net_saving_data.csv','w'); csv_write=csv.writer(f); for item in all_house: csv_write.writerow([item.get('house_area',None),item.get('price',None),item.get('build_year',None)]); #f.write('\n'); f.close();
效果如图:
如果想在csv中加入key值,操作如下:
csv_write.writerow(['house_area',item.get('house_area',None),'price',item.get('price',None),'build_year',item.get('build_year',None)]);
效果如图:
houses=[['2edr','ser','sge'],['as','hi','hioh','aaajio']]; f=open('saving_data.csv','w'); csv_write=csv.writer(f); for house in houses: csv_write.writerow([item for item in house]); f.close();
效果如图:
##写入 with open("anjuke_salehouse.json","w",encoding='utf-8') as f: json.dump(all_house,f,ensure_ascii=False); print(u'加载入文件完成...');
-参考:http://www.javashuo.com/article/p-dyjkekrw-cv.html
import csv; houses=[]; with open('net_saving_data.csv','r') as openscv: csv_reader=csv.reader(openscv); for row in csv_reader: houses.append(row); openscv.close(); print houses;
原数据界面:
读取数据界面以下:
##读入 with open("anjuke_salehouse.json",'r',encoding='utf-8') as f: load_dict=json.load(f); print (load_dict);
不用多想,直接load出来的就是与json文件内容一模一样的一个对象。encoding等主要是防止乱码的参数设置。
参考文章:https://blog.csdn.net/shandong_chu/article/details/70173952
with open('net_saving_data.txt','r') as opentxt: txt_reader=opentxt.readlines(); for lin in txt_reader: print (lin);
建库、增删改查,由于下面涉及一些对数据库的操作,现在这里复习一下基本的增删改查
(1)建数据库、建表
create table urls(id int NOT NULL auto_increment,url varchar(1000) NOT NULL,content varchar(4000) NOT NULL,created_time timestamp default current_timestamp,primary key(id)); */
(2)查表结构或查database
describe urls; show databases;
(3)表中插入数据
insert into urls(url,content)values("www.baidu.com","这是内容。") select * from urls where id=1;
(4)从数据表中提取数据
insert into urls(url,content)values("www.blog.com","博客网址。"); select * from urls ;
(5)删除数据
delete from urls where url='www.baidu.com'; select * from urls ;
(6)修改数据
将id=2的content改为博客园
insert into urls(url,content)values("www.santostang.com","Santos blog"); update urls set url='www.blog.com',content="博客园" where id=2; select * from urls ;
(7)语句参考地址:https://blog.csdn.net/ljxfblog/article/details/52066006
select * from a order by id) union (select * from b order by id);
//使用连表查询 SELECT Persons.LastName, Persons.FirstName,Orders.OrderNo FROM Persons, Orders WHERE Persons.Id_P = Orders.Id_P //使用join查询(inner join) SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons INNER JOIN Orders ON Persons.Id_P = Orders.Id_P ORDER BY Persons.LastName
//使用left join查询,只要左表有匹配的条件,就会生成一行,右表的列值为空。 SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons LEFT JOIN Orders ON Persons.Id_P=Orders.Id_P ORDER BY Persons.LastName
//使用right join查询,只要右表有匹配的条件,就会生成一行,左表的列值为空。 SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons RIGHT JOIN Orders ON Persons.Id_P=Orders.Id_P ORDER BY Persons.LastName
//使用full join查询,只要其中一个表中存在匹配,就会生成一行,另外一个表的列值为空。 SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo FROM Persons FULL JOIN Orders ON Persons.Id_P=Orders.Id_P ORDER BY Persons.LastName
alter table urls add created_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP; #增长一列 alter table test modify content char(10) #修改表列类型
参考文献:http://www.javashuo.com/article/p-dnxfmvts-gb.html
在操作数据库的时候,python2中一般使用mysqldb,但在python3中已经不再支持mysqldb了,咱们可以用pymysql和mysql.connector。本文的所有操作都是在python3的pymysql下完成的。python -m pip install pymysql
mysql -u root -p using mysql; select host,user from mysql.user;
mysql的host、user、password等信息。
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping'); cur=conn.cursor();#获取方法,建立游标 sql='select * from urls'; recount=cur.execute(sql);#操做execute()方法写入sql语句 data=cur.fetchall(); # 返回数据,返回的是tuple类型 print data;
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping')
用于创建数据库的连接,里面指定参数(用户名,密码,主机信息)。cur=conn.cursor()
通过获取的数据库连接conn下的cursor()方法来创建游标,之后通过游标调用execute()
方法写入纯SQL语句。完成MySQL数据库操做后,须要关闭游标cur和链接conn。conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping'); cur=conn.cursor(); sql1='insert into urls(url,content)values(%s,%s)'; params=('www.sinlang.com','新浪微博'); recount=cur.execute(sql1,params); ##executemany 批量插入 li=[('www.blogs.com','批量插入的第一个'),('www.sou.com','批量插入的第二个')]; sql2='insert into urls(url,content)values(%s,%s)'; recount=cur.executemany(sql2,li); sql3=sql='select * from urls'; recount=cur.execute(sql3); data=cur.fetchall(); conn.commit; cur.close; conn.close; print data; #返回的都是元组((1,'','',time),(2,'','',time)...(6,'','',time));
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping'); cur=conn.cursor(cursor=pymysql.cursors.DictCursor);# 参数设置 sql='select * from urls'; recount=cur.execute(sql); data=cur.fetchall(); cur.close(); conn.close(); print recount; print data; #返回的是列表含字典[{u'url': 'www.baidu.com', u'content': 'xxx', u'id': 1, u'created_time': datetime.datetime(2018, 8, 22, 22, 2, 23)}, {xxx}, {xxx}];
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping'); cur=conn.cursor(cursor=pymysql.cursors.DictCursor); sql='select * from urls'; recount=cur.execute(sql); data=cur.fetchall(); for i in range(len(data)): print data[i] cur.close(); conn.close(); print recount;
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='scraping'); cur=conn.cursor(cursor=pymysql.cursors.DictCursor); sql='select * from urls'; recount=cur.execute(sql); cur.close(); conn.close(); print recount; for i in range(recount): data=cur.fetchone(); print data;
两种方法获取结果都以下:
(1) 在cmd 数据库中先建立database 和相应表;
create database anjuke; use anjuke; create table anjuke (id int not null Auto_increment,house_title varchar(1000) not null,house_layout varchar(1000) not null,house_area int not null,house_levers int not null,brokername varchar (1000),address varchar(2000),price int not null,primary key(id));
(2)将数据插入数据库中,爬取的数据格式如下[{},{},{},{}]。for循环列表,提取每一个字典中的信息,建立sql语句并传参至execute中。
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='anjuke'); cur=conn.cursor(); for item in all_house: house_title=item['house_title']; house_layout=item['house_layout']; house_area=item['house_area']; house_levers=item['house_levers']; brokername=item['brokername']; house_address=item['house_address']; price=item['price']; sql='insert into anjuke(house_title,house_layout,house_area,house_levers,address,brokername,price) values (%s,%s,%s,%s,%s,%s,%s)'; #parme=(house_title,house_layout,house_area,house_levers,house_address,brokername,price); #cur.execute(sql,parme); cur.execute(sql,(house_title,house_layout,house_area,house_levers,house_address,brokername,price)); conn.commit(); cur.close(); conn.close();
(3) 读取存入MySQL数据库的网页爬取数据,可以以[{},{},{}...{}]或者{}\n{}\n{}\n...\n{}形式输出。
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='anjuke'); cur=conn.cursor(cursor=pymysql.cursors.DictCursor); sql='select * from anjuke'; cur.execute(sql); conn.close(); cur.close(); data=cur.fetchall();#[{},{},{}...{}] print data;
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='anjuke'); cur=conn.cursor(cursor=pymysql.cursors.DictCursor); sql='select * from anjuke'; recount=cur.execute(sql); conn.close(); cur.close(); for i in range(recount): data=cur.fetchone(); print data;
conn=pymysql.connect()
、获取游标:cur=conn.cursor()
、对数据库操做:cur.execute(sql)
、获取数据库:cur.fetchall()
四个操做进行数据库操做。通常获取后,能够不用commit,存入数据等须要conn.commit()
,但都要conn.close()
,cur.close()
。