1. DDL
Database-related operations
Drop a database
drop database if exists <database name> [restrict|cascade];
-- Drop a database. By default Hive will not drop a database that still contains tables; the tables must be removed first, otherwise the drop fails with an error.
-- Adding the cascade keyword forces the drop even if tables exist. The default is restrict, i.e. the restricted behavior described above.
e.g. hive> drop database if exists users cascade;
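The notes above only show drop; for completeness, a minimal sketch of creating and switching to a database (the name mydb is only an illustration, not from the original notes):
create database if not exists mydb;
show databases;
use mydb;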
1.1 Partitioned tables (PARTITIONED BY)
Partitioned tables come in two forms: single-level partitioning, where the table directory contains only one level of sub-directories, and multi-level partitioning, where sub-directories are nested inside the table directory.
1.1.1 Single-partition table DDL:
create table day_table (id int, content string) partitioned by (dt string);
A single-partition table, partitioned by day; the table exposes three columns: id, content, and dt.
1.1.2 Two-level partition table DDL:
create table day_hour_table (id int, content string) partitioned by (dt string, hour string);
A two-level partitioned table, partitioned by day and hour; the dt and hour columns are added to the table schema.
1.1.3 Loading data
Load into a single-partition table:
LOAD DATA local INPATH '/root/hivedata/dat_table.txt' INTO TABLE day_table partition(dt='2017-07-07');
Load into a two-level partitioned table:
LOAD DATA local INPATH '/root/hivedata/dat_table.txt' INTO TABLE day_hour_table PARTITION(dt='2017-07-07', hour='08');
1.1.4 Partition-based queries:
SELECT day_table.* FROM day_table WHERE day_table.dt = '2017-07-07';
1.1.5 Show partitions:
show partitions day_hour_table;
In short, partitions are a query aid: they narrow the range a query has to scan, speed up data retrieval, and let you manage data according to fixed rules and conditions.
1.2 ROW FORMAT DELIMITED (specifying delimiters)
Create a partitioned table with a specified field delimiter:
create table day_table (id int, content string) partitioned by (dt string) row format delimited fields terminated by ',';

Specify delimiters for tables with complex column types:
create table complex_array(name string, work_locations array<string>) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',';
Sample data:
zhangsan beijing,shanghai,tianjin,hangzhou
wangwu shanghai,chengdu,wuhan,haerbin

create table t_map(id int, name string, hobby map<string,string>) row format delimited fields terminated by ',' collection items terminated by '-' map keys terminated by ':';
Sample data:
1,zhangsan,唱歌:非常喜欢-跳舞:喜欢-游泳:一般般
2,lisi,打游戏:非常喜欢-篮球:不喜欢
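To check that the delimiters were parsed as intended, you can query the complex columns directly. A small sketch, assuming the sample data above has been loaded into complex_array and t_map (array indexing and map-key access are standard HiveQL):
select name, work_locations[0], size(work_locations) from complex_array;
select id, name, hobby['唱歌'] from t_map;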
1.3 Managed (internal) and external tables
create table student(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',';
create external table student_ext(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',' location '/stu';
Load data into the managed and external tables:
load data local inpath '/root/hivedata/students.txt' overwrite into table student;
load data inpath '/stu' into table student_ext;
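The practical difference shows up when the tables are dropped: dropping the managed table also deletes its data files from the warehouse directory, while dropping the external table removes only the metadata and leaves the files under /stu in place. A sketch:
drop table student;      -- managed table: data files are deleted as well
drop table student_ext;  -- external table: only metadata is removed, /stu is kept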
1.4 Local mode
set hive.exec.mode.local.auto=true;
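With the auto switch on, Hive still only runs a job locally when its input is small enough. The two thresholds below are the commonly documented defaults; treat the exact property names and values as something to verify against your Hive version:
set hive.exec.mode.local.auto.inputbytes.max=134217728;  -- max total input size (128 MB) for local mode
set hive.exec.mode.local.auto.input.files.max=4;         -- max number of input files for local mode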
1.5 Bucketed tables (clustered by ... into num buckets)
Enable bucketing:
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
TRUNCATE TABLE stu_buck;   -- empty the table first
drop table stu_buck;       -- or drop it, then recreate it
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited fields terminated by ',';
Load data into the bucketed table:
insert overwrite table stu_buck select * from student cluster by(Sno);
Bucketing and sorting in queries: cluster by, sort by, distribute by
select * from student cluster by(Sno);
insert overwrite table student_buck select * from student cluster by(Sno) sort by(Sage);
-- error: cluster by and sort by cannot be used together
Bucket by one column while sorting by another:
insert overwrite table stu_buck select * from student distribute by(Sno) sort by(Sage asc);
Summary:
cluster by (distributes and sorts, same column) == distribute by (distributes) + sort by (sorts) (the columns may differ)
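So when the distribute and sort columns are the same, the two queries below should return the data bucketed and ordered identically:
select * from student cluster by(Sno);
select * from student distribute by(Sno) sort by(Sno);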
Adding / dropping partitions
drop table t_partition;
create table t_partition(id int,name string) partitioned by (dt string) row format delimited fields terminated by ',';
Add a partition
alter table t_partition add partition (dt='2008-08-08') location 'hdfs://node-21:9000/t_parti/';
When the partition is added this way, the data under /t_parti is not moved, and no dt=2008-08-08 partition directory is created.
Drop a partition
alter table t_partition drop partition (dt='2008-08-08');
When the partition is dropped, the data under /t_parti is deleted, and the /t_parti directory itself is removed as well.
Note the difference from adding a partition with LOAD DATA: that does move the data and does create the partition directory, as sketched below.
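A sketch of that LOAD DATA path for comparison (the local file path is only illustrative): it moves the file into a new dt=2008-08-08 partition directory and registers the partition in the metastore:
load data local inpath '/root/hivedata/t_parti.txt' into table t_partition partition (dt='2008-08-08');
show partitions t_partition;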
INSERT ... SELECT statements
Multi-insert:
create table source_table (id int, name string) row format delimited fields terminated by ',';
create table test_insert1 (id int) row format delimited fields terminated by ',';
create table test_insert2 (name string) row format delimited fields terminated by ',';

from source_table
insert overwrite table test_insert1 select id
insert overwrite table test_insert2 select name;
Dynamic partition insert
set hive.exec.dynamic.partition=true;           -- enable dynamic partitioning (off by default)
set hive.exec.dynamic.partition.mode=nonstrict; -- partition mode; the default strict requires at least one static partition, while nonstrict allows every partition column to be dynamic
Requirement:
Insert the rows of dynamic_partition_table into the corresponding partitions of the target table d_p_t, based on the day column.
Source table:
create table dynamic_partition_table(day string,ip string) row format delimited fields terminated by ',';
load data local inpath '/root/hivedata/dynamic_partition_table.txt' into table dynamic_partition_table;
Sample data:
2015-05-10,ip1
2015-05-10,ip2
2015-06-14,ip3
2015-06-14,ip4
2015-06-15,ip1
2015-06-15,ip2
Target table:
create table d_p_t(ip string) partitioned by (month string,day string);
Dynamic insert:
insert overwrite table d_p_t partition (month,day) select ip,substr(day,1,7) as month,day from dynamic_partition_table;
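Given the six sample rows above, the insert should create one partition per distinct (month, day) pair, which can be checked with show partitions (expected output derived from the sample data):
show partitions d_p_t;
-- month=2015-05/day=2015-05-10
-- month=2015-06/day=2015-06-14
-- month=2015-06/day=2015-06-15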
Exporting query results to the file system
Save the query result to a specified directory (either local or HDFS):
insert overwrite local directory '/root/123456' select * from t_p;
insert overwrite directory '/aaa/test' select * from t_p;
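The exported files use Hive's default ^A field separator; if a friendlier delimiter is needed, newer Hive versions (0.11+) also accept a row format clause on the export, sketched here:
insert overwrite local directory '/root/123456' row format delimited fields terminated by ',' select * from t_p;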
Joins in Hive
Prepare the data
a.txt:
1,a
2,b
3,c
4,d
7,y
8,u
b.txt:
2,bb
3,cc
7,yy
9,pp
Create the tables:
create table a(id int,name string) row format delimited fields terminated by ',';
create table b(id int,name string) row format delimited fields terminated by ',';
Load the data:
load data local inpath '/root/hivedata/a.txt' into table a;
load data local inpath '/root/hivedata/b.txt' into table b;
Experiments:
** inner join
select * from a inner join b on a.id=b.id;
select a.id,a.name from a join b on a.id = b.id;
select a.* from a join b on a.id = b.id;

+-------+---------+-------+---------+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
+-------+---------+-------+---------+
**left join
select * from a left join b on a.id=b.id;

+-------+---------+-------+---------+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+
**right join
select * from a right join b on a.id=b.id;
select * from b right join a on b.id=a.id;

+-------+---------+-------+---------+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+
**full outer join
select * from a full outer join b on a.id=b.id;

+-------+---------+-------+---------+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+
**Hive's special join: left semi join
select * from a left semi join b on a.id = b.id;
select a.* from a inner join b on a.id=b.id;

+-------+---------+
| a.id  | a.name  |
+-------+---------+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+
Equivalent to:
select a.id,a.name from a where a.id in (select b.id from b);   -- extremely inefficient in Hive
select a.id,a.name from a join b on (a.id = b.id);
select * from a inner join b on a.id=b.id;
**cross join (use with caution)
Returns the Cartesian product of the two tables; no join key needs to be specified.
select a.*, b.* from a cross join b;
Built-in JSON functions
select get_json_object(line,'$.movie') as movie, get_json_object(line,'$.rate') as rate from rat_json limit 10;
transform example:
1. First, load the rating.json file into a raw Hive table rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/root/hivedata/rating.json' into table rat_json;
2. Parse the JSON data into four fields and insert them into a new table t_rating
drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string) row format delimited fields terminated by '\t';
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json limit 10;
3. Use transform + Python to convert the unixtime into a weekday
First, write a Python script file.
Python code:
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

# read tab-separated rows from Hive, replace the unix timestamp with an ISO weekday (1 = Monday ... 7 = Sunday)
for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])

Save the file.
Then add the file as a Hive resource so it is shipped with the query:
hive>add FILE /root/hivedata/weekday_mapper.py;
create table u_data_new as
select transform (movieid, rate, timestring, uid)
using 'python weekday_mapper.py'
as (movieid, rate, weekday, uid)
from t_rating;

select distinct(weekday) from u_data_new limit 10;
View detailed table metadata:
desc formatted student;