-- Create the big table
create table bigtable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
-- Create the small table
create table smalltable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
-- Create the table that will hold the JOIN result
create table jointable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable;
load data local inpath '/opt/module/data/smalltable' into table smalltable;
3) Data download address:
EXPLAIN [EXTENDED | DEPENDENCY | AUTHORIZATION] query-sql
explain select * from bigtable;
explain select click_url, count(*) ct from bigtable group by click_url;
explain extended select * from bigtable;
explain extended select click_url, count(*) ct from bigtable group by click_url;
dept_20200401.log
dept_20200402.log
dept_20200403.log
create table dept_partition( deptno int, dname string, loc string ) partitioned by (day string) row format delimited fields terminated by '\t';
10 ACCOUNTING 1700
20 RESEARCH 1800
30 SALES 1900
40 OPERATIONS 1700
50 TEST 2000
60 DEV 1900
load data local inpath '/opt/module/data/dept_20200401.log' into table dept_partition partition(day='20200401');
load data local inpath '/opt/module/data/dept_20200402.log' into table dept_partition partition(day='20200402');
load data local inpath '/opt/module/data/dept_20200403.log' into table dept_partition partition(day='20200403');
select * from dept_partition where day='20200401';
select * from dept_partition where day='20200401' union select * from dept_partition where day='20200402' union select * from dept_partition where day='20200403';
select * from dept_partition where day='20200401' or day='20200402' or day='20200403';
alter table dept_partition add partition(day='20200404');
(2) Add multiple partitions at once
alter table dept_partition add partition(day='20200405') partition(day='20200406');
alter table dept_partition drop partition(day='20200406');
(2) Drop multiple partitions at once
alter table dept_partition drop partition(day='20200404'),partition(day='20200405');
show partitions dept_partition;
desc formatted dept_partition;
create table dept_partition2( deptno int, dname string, loc string) partitioned by (day string, hour string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/dept_20200401.log' into table dept_partition2 partition(day='20200401', hour='12');
select * from dept_partition2 where day='20200401' and hour='12';
-- Enable dynamic partitioning (default true)
set hive.exec.dynamic.partition=true;
-- nonstrict allows all partition columns to be dynamic; strict requires at least one static partition
set hive.exec.dynamic.partition.mode=nonstrict;
-- Maximum number of dynamic partitions that may be created across the whole job (default 1000)
set hive.exec.max.dynamic.partitions=1000;
-- Maximum number of dynamic partitions that may be created on a single node (default 100)
set hive.exec.max.dynamic.partitions.pernode=400;
-- Maximum number of HDFS files the whole job may create (default 100000)
set hive.exec.max.created.files=100000;
-- Whether to throw an exception when a dynamic partition ends up empty (default false)
set hive.error.on.empty.partition=false;
create table dept_partition_dy(id int, name string) partitioned by (loc int) row format delimited fields terminated by '\t';
set hive.exec.dynamic.partition.mode = nonstrict;
insert into table dept_partition_dy partition(loc) select deptno, dname, loc from dept_partition;
show partitions dept_partition_dy;
vim /opt/module/data/student.txt
1001 ss1
1002 ss2
1003 ss3
1004 ss4
1005 ss5
1006 ss6
1007 ss7
1008 ss8
1009 ss9
1010 ss10
1011 ss11
1012 ss12
1013 ss13
1014 ss14
1015 ss15
1016 ss16
create table stu_buck(id int, name string) clustered by(id) into 4 buckets row format delimited fields terminated by ' ';
desc formatted stu_buck;
load data local inpath '/opt/module/data/student.txt' into table stu_buck;
select * from stu_buck;
insert into table stu_buck select * from student_insert;
TABLESAMPLE(BUCKET x OUT OF y)
select * from stu_buck tablesample(bucket 1 out of 4 on id);
hive (test)> select * from stu_buck tablesample(bucket 5 out of 4 on id);
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
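For reference, y does not have to equal the bucket count. A minimal sketch against the 4-bucket stu_buck table above: with y = 2 the sample covers 4 / 2 = 2 buckets, starting at bucket x and stepping by y (here buckets 1 and 3).
-- Sample half of the buckets of stu_buck
select * from stu_buck tablesample(bucket 1 out of 2 on id);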
Compression format    Corresponding codec
DEFLATE               org.apache.hadoop.io.compress.DefaultCodec
gzip                  org.apache.hadoop.io.compress.GzipCodec
bzip2                 org.apache.hadoop.io.compress.BZip2Codec
LZO                   com.hadoop.compression.lzo.LzopCodec
Snappy                org.apache.hadoop.io.compress.SnappyCodec
Comparison of compression performance:
-- Whether to aggregate on the map side (default true)
set hive.map.aggr = true;
-- Number of entries aggregated on the map side (default 100000)
set hive.groupby.mapaggr.checkinterval = 100000;
-- Load-balance when the group-by keys are skewed (default false)
set hive.groupby.skewindata = true;
create table emp(empno int,empname string,deptno int) partitioned by (day string) row format delimited fields terminated by ' ';
vim /opt/module/data/emp.txt
1 aa 10
2 bb 20
3 cc 30
load data local inpath '/opt/module/data/emp.txt' into table emp;
select deptno from emp group by deptno;
set hive.groupby.skewindata = true;
select deptno from emp group by deptno;
-- Enable vectorized execution (rows are processed in batches of 1024) for both the map side and the reduce side
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
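A minimal sketch of the kind of scan-filter-aggregate query that vectorized execution speeds up. emp_orc is a hypothetical ORC copy of the emp table above, created only for this illustration, since vectorization takes effect on ORC-backed tables.
-- Hypothetical ORC copy of emp, used only to demonstrate vectorized execution
create table emp_orc stored as orc as select empno, empname, deptno from emp;
set hive.vectorized.execution.enabled = true;
select deptno, count(*) from emp_orc where empno > 0 group by deptno;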
-- Anti-pattern: separate INSERT ... SELECT statements scan the student table once each
insert .... select id, name, sex, age from student where age > 17;
insert .... select id, name, sex, age from student where age > 18;
insert .... select id, name, sex, age from student where age > 19;
insert into table t_ptn partition(city='A') select id, name, sex, age from student where city = 'A';
insert into table t_ptn partition(city='B') select id, name, sex, age from student where city = 'B';
insert into table t_ptn partition(city='C') select id, name, sex, age from student where city = 'C';
-- Preferred: one multi-insert statement scans student only once
from student
insert into table t_ptn partition(city='A') select id, name, sex, age where city = 'A'
insert into table t_ptn partition(city='B') select id, name, sex, age where city = 'B';
select a.id, a.name from a where a.id in (select b.id from b); select a.id, a.name from a where exists (select id from b where a.id =b.id);
select a.id, a.name from a join b on a.id = b.id;
select a.id, a.name from a left semi join b on a.id = b.id;
select a.*, b.*, c.* from a join b on a.id = b.id join c on a.id = c.id;
-- Enable the cost-based optimizer and let it use table, column, and partition statistics
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
-- Predicate pushdown, default true
set hive.optimize.ppd = true;
explain select o.id from bigtable b join bigtable o on o.id = b.id where o.id <= 10;
explain select b.id from bigtable b join (select id from bigtable where id <= 10) o on b.id = o.id;
-- Automatically convert eligible joins to MapJoin, default true
set hive.auto.convert.join=true;
-- Size threshold in bytes below which a table is treated as the small table for MapJoin, default 25000000 (about 25 MB)
set hive.mapjoin.smalltable.filesize=25000000;
-- Default true
set hive.auto.convert.join = true;
Explain insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from smalltable s left join bigtable b on s.id = b.id;
Time taken: 52.581 seconds
Explain insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable b left join smalltable s on s.id = b.id;
Time taken: 55.997 seconds
create table bigtable2( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable2;
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable a join bigtable2 b on a.id = b.id;
create table bigtable_buck1( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) clustered by(id) sorted by(id) into 6 buckets row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable_buck1;
create table bigtable_buck2( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) clustered by(id) sorted by(id) into 6 buckets row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable_buck2;
-- Enable bucket map join and sort-merge bucket (SMB) join
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable_buck1 s join bigtable_buck2 b on b.id = s.id;
Test result: Time taken: 96.226 seconds
set hive.map.aggr = true;
set hive.groupby.mapaggr.checkinterval = 100000;
set hive.groupby.skewindata = true;
-- Amount of data handled by each reducer (default 256 MB)
set hive.exec.reducers.bytes.per.reducer = 256000000;
-- Maximum number of reducers per job (default 1009)
set hive.exec.reducers.max = 1009;
Number of reducers: N = min(parameter 2, total input size / parameter 1), where parameter 2 is the 1009 above and parameter 1 is the 256 MB above.
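As a hedged worked example (illustrative input size, not from the source): with the defaults above and a job reading roughly 3 GB, N = min(1009, 3072 MB / 256 MB) = 12 reducers; explicitly setting mapreduce.job.reduces, as in the next line, overrides this estimate.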
set mapreduce.job.reduces = 15;
-- If the number of records for a join key exceeds this value, the key is split out and handled separately; tune it to the actual data volume
set hive.skewjoin.key=100000;
-- Set to true if skew shows up during the join
set hive.optimize.skewjoin=false;
-- Number of map tasks for the follow-up MapJoin job on the skewed keys, default 10000
set hive.skewjoin.mapjoin.map.tasks=10000;
select count(*) from emp;
set mapreduce.input.fileinputformat.split.maxsize=100;
select count(*) from emp;
-- CombineHiveInputFormat merges small files into larger splits before the map stage (this is Hive's default input format)
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- Merge small files at the end of a map-only job, default true
set hive.merge.mapfiles = true;
-- Merge small files at the end of a map-reduce job, default false
set hive.merge.mapredfiles = true;
-- Size of the merged files, default 256 MB
set hive.merge.size.per.task = 268435456;
-- When the average size of the output files is below this value, start an extra map-reduce job to merge them
set hive.merge.smallfiles.avgsize = 16777216;
-- Equivalent to running a combiner on the map side
set hive.map.aggr=true;
-- Default true
set mapred.map.tasks.speculative.execution = true;
set hive.exec.reducers.bytes.per.reducer = 256000000;
set hive.exec.reducers.max = 1009;
Number of reducers: N = min(parameter 2, total input size / parameter 1), where parameter 2 is the 1009 above and parameter 1 is the 256 MB above.
set mapreduce.job.reduces = 15;
mapred.reduce.tasks.speculative.execution (the Hadoop-side parameter)
hive.mapred.reduce.tasks.speculative.execution (the equivalent parameter on the Hive side; the two have the same effect, so setting either one is enough)
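As a sketch, when long-running reduce tasks make speculative re-execution wasteful, both switches listed above can be turned off for the current session:
-- Disable reduce-side speculative execution via either parameter
set mapred.reduce.tasks.speculative.execution=false;
set hive.mapred.reduce.tasks.speculative.execution=false;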
<property>
  <name>hive.fetch.task.conversion</name>
  <value>more</value>
  <description>
    Expects one of [none, minimal, more].
    Some select queries can be converted to single FETCH task minimizing latency.
    Currently the query should be single sourced not having any subquery and should not have any aggregations or distincts (which incurs RS), lateral views and joins.
    0. none : disable hive.fetch.task.conversion
    1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only
    2. more : SELECT, FILTER, LIMIT only (support TABLESAMPLE and virtual columns)
  </description>
</property>
set hive.fetch.task.conversion=none;
select * from emp;
select empname from emp;
select empname from emp limit 3;
set hive.fetch.task.conversion=more;
select * from emp;
select empname from emp;
select empname from emp limit 3;
-- Enable local-mode MR
set hive.exec.mode.local.auto=true;
-- Maximum total input size for local MR; local mode is used when the input is smaller than this value, default 134217728 (128 MB)
set hive.exec.mode.local.auto.inputbytes.max=50000000;
-- Maximum number of input files for local MR; local mode is used when there are fewer files than this value, default 4
set hive.exec.mode.local.auto.input.files.max=10;
set hive.exec.mode.local.auto=true;
select * from emp cluster by deptno;
Time taken: 1.443 seconds, Fetched: 3 row(s)
set hive.exec.mode.local.auto=false;
select * from emp cluster by deptno;
Time taken: 19.493 seconds, Fetched: 3 row(s)
-- Allow independent stages to run in parallel, default false
set hive.exec.parallel=true;
-- Maximum degree of parallelism allowed for a single SQL statement, default 8
set hive.exec.parallel.thread.number=16;
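A minimal sketch, reusing the emp table above: the two branches of this UNION ALL have no dependency on each other, so their stages can run concurrently once hive.exec.parallel is enabled.
-- Two independent aggregation branches that form parallelizable stages
select * from (
  select deptno, count(*) c from emp group by deptno
  union all
  select deptno, cast(max(empno) as bigint) c from emp group by deptno
) t;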
yarn.nodemanager.resource.memory-mb*(spark.executor.cores/yarn.nodemanager.resource.cpu-vcores)
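For example (illustrative cluster numbers, not from the source): with yarn.nodemanager.resource.memory-mb = 114688 (112 GB), yarn.nodemanager.resource.cpu-vcores = 32, and spark.executor.cores = 4, each executor is allotted 112 GB * (4 / 32) = 14 GB; splitting that roughly 8:2 between heap and off-heap overhead yields the 11.2g executor memory and 2.8g memoryOverhead used below.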
set spark.executor.memory=11.2g;
set spark.yarn.executor.memoryOverhead=2.8g;
set hive.execution.engine=spark;
set spark.executor.memory=11.2g;
set spark.yarn.executor.memoryOverhead=2.8g;
set spark.executor.cores=4;
set spark.executor.instances=40;
set spark.dynamicAllocation.enabled=true;
set spark.serializer=org.apache.spark.serializer.KryoSerializer;