若在Hive中执行INSERT OVERWRITE重写同一个表的数据时,有可能会形成数据丢失。apache
如 INSERT OVERWRITE TABLE table_name SELECT * FROM table_nameoop
1、新建一张分区表
create table test_chj_cols (id string, name string, age string) partitioned by (ds string) stored as textfile;
2、插入一条记录
insert into test_chj_cols partition (ds='20181224') values ('1','chj','18');
3、确认表数据及结构
> select * from test_chj_cols; OK test_chj_cols.id test_chj_cols.name test_chj_cols.age test_chj_cols.ds 1 chj 18 20181224 > desc formatted test_chj_cols partition (ds='20181224'); OK col_name data_type comment # col_name data_type comment id string name string age string # Partition Information # col_name data_type comment ds string # Detailed Partition Information Partition Value: [20181224] Database: hduser05db Table: test_chj_cols CreateTime: Mon Dec 24 19:35:28 CST 2018 LastAccessTime: UNKNOWN Protect Mode: None Location: hdfs://bdphdp02/user/hive/warehouse/hduser05/hduser05db.db/test_chj_cols/ds=20181224 Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 numRows 1 rawDataSize 8 totalSize 17 transient_lastDdlTime 1545651329 # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 Time taken: 0.099 seconds, Fetched: 37 row(s)
4、在表中间新增字段
alter table test_chj_cols replace columns (id string, name string, money string, age string); > desc formatted test_chj_cols; OK col_name data_type comment # col_name data_type comment id string name string money string age string # Partition Information # col_name data_type comment ds string # Detailed Table Information Database: hduser05db Owner: hadoop CreateTime: Mon Dec 24 19:34:46 CST 2018 LastAccessTime: UNKNOWN Protect Mode: None Retention: 0 Location: hdfs://bdphdp02/user/hive/warehouse/hduser05/hduser05db.db/test_chj_cols Table Type: MANAGED_TABLE Table Parameters: last_modified_by hadoop last_modified_time 1545651722 transient_lastDdlTime 1545651722 # Storage Information SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe InputFormat: org.apache.hadoop.mapred.TextInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 Time taken: 0.051 seconds, Fetched: 36 row(s)
5、重写数据
insert overwrite table test_chj_cols partition (ds='20181224') select id,name,age,name fromspa
test_chj_cols;
6、age字段数据丢失
> select * from test_chj_cols; OK test_chj_cols.id test_chj_cols.name test_chj_cols.age test_chj_cols.money test_chj_cols.ds 1 chj NULL NULL 20181224