
42 Partitioned Tables, Bucketed Tables, and Functions


Partitioned Tables

Definition of a Partitioned Table
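A partitioned table stores its data in separate subdirectories of the table directory on HDFS, one per value of the partition column. The partition column behaves like a pseudo column: it is not stored inside the data files, but it can be used in WHERE clauses so that a query only scans the matching subdirectories.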

Basic Operations on Partitioned Tables
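A minimal sketch of the usual single-level operations, assuming a dept_partition table (partitioned by day) and a sample file under /opt/module/hive/datas/; the table and file names follow the naming used elsewhere in these notes but are assumptions here:

-- Create a single-level partitioned table (partition column: day)
create table dept_partition(
    deptno int, dname string, loc string
)
partitioned by (day string)
row format delimited fields terminated by '\t';

-- Load data into a specific partition
load data local inpath '/opt/module/hive/datas/dept_20200401.log'
into table dept_partition partition(day='20200401');

-- Query a single partition
select * from dept_partition where day='20200401';

-- Add and drop partitions
alter table dept_partition add partition(day='20200402');
alter table dept_partition drop partition(day='20200402');

-- List partitions and inspect the table structure
show partitions dept_partition;
desc formatted dept_partition;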

Two-Level Partitions

Create a two-level partitioned table

create table dept_partition2(
    deptno int, dname string, loc string
)
partitioned by (day string, hour string)
row format delimited fields terminated by '\t';

Load data the normal way

-- Load data into the two-level partitioned table
load data local inpath '/opt/module/hive/datas/dept_20200401.log' into table
dept_partition2 partition(day='20200401', hour='12');

-- Query data from one partition
select * from dept_partition2 where day='20200401' and hour='12';

Upload data directly to the partition directory
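When a file is copied into a partition-style directory by hand, the metastore does not yet know about that partition. A minimal sketch of the usual fixes, assuming the default warehouse location /user/hive/warehouse and the dept_partition2 table above (the hour=13 partition and the file paths are illustrative):

-- Create the partition directory and upload the file by hand (dfs commands from the Hive CLI)
dfs -mkdir -p /user/hive/warehouse/dept_partition2/day=20200401/hour=13;
dfs -put /opt/module/hive/datas/dept_20200401.log /user/hive/warehouse/dept_partition2/day=20200401/hour=13;

-- Option 1: let Hive discover the new directory and register the partition
msck repair table dept_partition2;

-- Option 2: register the partition explicitly
alter table dept_partition2 add partition(day='20200401', hour='13');

-- Option 3: load the file through Hive, which creates the partition itself
load data local inpath '/opt/module/hive/datas/dept_20200401.log' into table dept_partition2 partition(day='20200401', hour='13');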

Dynamic Partitions

Parameter settings

-- Enable dynamic partitioning (default: true, enabled)
hive.exec.dynamic.partition=true

-- Set non-strict mode
-- Dynamic partition mode. The default is strict, which requires at least one
-- partition column to be given a static value; nonstrict allows every
-- partition column to be filled in dynamically
hive.exec.dynamic.partition.mode=nonstrict

-- Maximum total number of dynamic partitions that may be created across all
-- nodes running the MR job (default: 1000)
hive.exec.max.dynamic.partitions=1000

-- Maximum number of dynamic partitions that may be created on each node
-- running the MR job. If the day column has 365 distinct values, this must be
-- set to more than 365; the default of 100 would make the job fail
hive.exec.max.dynamic.partitions.pernode=100

-- Maximum number of HDFS files the whole MR job may create (default: 100000)
hive.exec.max.created.files=100000

-- Whether to throw an exception when an empty partition is generated.
-- Usually does not need to be changed (default: false)
hive.error.on.empty.partition=false

Hands-on example

-- Create the target partitioned table
create table dept_partition_dy(id int, name string) partitioned by (loc int) row format delimited fields terminated by '\t';

-- Switch dynamic partitioning to non-strict mode
set hive.exec.dynamic.partition.mode = nonstrict;

-- Insert with the partition column filled in dynamically from the query
insert into table dept_partition_dy partition(loc) select deptno, dname, loc from dept;

-- Check the partitions of the target table
show partitions dept_partition_dy;

Bucketed Tables

Data preparation

1001	ss1
1002	ss2
1003	ss3
1004	ss4
1005	ss5
1006	ss6
1007	ss7
1008	ss8
1009	ss9
1010	ss10
1011	ss11
1012	ss12
1013	ss13
1014	ss14
1015	ss15
1016	ss16

Create a bucketed table

create table stu_buck(id int, name string)
clustered by(id) 
into 4 buckets
row format delimited fields terminated by '\t';

Import data into the bucketed table with load

load data inpath '/student.txt' into table stu_buck;

Import data into the bucketed table with insert

insert into table stu_buck select * from student_insert;

Query the bucketed data

select * from stu_buck;

Bucketing rule
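Hive assigns each row to a bucket by hashing the clustered-by column and taking the hash modulo the number of buckets; for an int column the hash is the value itself. With the 4-bucket stu_buck table above, for example (the output file names are the typical MapReduce names and may vary):

-- id 1001 -> 1001 % 4 = 1 -> bucket file 000001_0
-- id 1002 -> 1002 % 4 = 2 -> bucket file 000002_0
-- id 1004 -> 1004 % 4 = 0 -> bucket file 000000_0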

Notes on working with bucketed tables

Sampling queries

select * from stu_buck tablesample(bucket 1 out of 4 on id);
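tablesample(bucket x out of y on id) starts at bucket x and then takes every y-th bucket, so it returns roughly total_buckets / y of the data; x must not be larger than y. For instance, to sample half of the 4 buckets (buckets 1 and 3):

select * from stu_buck tablesample(bucket 1 out of 2 on id);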

Common Built-in Functions

Assigning a value to NULL fields
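NVL(col, default_value) returns default_value whenever col is NULL. A minimal sketch, assuming a hypothetical emp table with a nullable comm column and an mgr column:

-- Show 0 instead of NULL for employees with no commission
select ename, nvl(comm, 0) from emp;

-- The fallback can also be another column, e.g. the manager id
select ename, nvl(comm, mgr) from emp;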

CASE WHEN ... THEN ... ELSE ... END
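CASE WHEN turns a condition into a value, which combines well with aggregation. A minimal sketch, assuming a hypothetical emp_sex(name, dept_id, sex) table:

-- Count men and women per department
select
    dept_id,
    sum(case sex when 'male' then 1 else 0 end) male_count,
    sum(case sex when 'female' then 1 else 0 end) female_count
from emp_sex
group by dept_id;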

Rows to columns
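Rows to columns usually means collapsing the rows of a group into one row, using CONCAT_WS together with COLLECT_SET (or COLLECT_LIST). A minimal sketch, assuming a hypothetical person_info(name, constellation, blood_type) table:

-- One row per (constellation, blood_type), with all matching names joined by '|'
select
    concat(constellation, ',', blood_type) base,
    concat_ws('|', collect_set(name)) names
from person_info
group by constellation, blood_type;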

Columns to rows
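Columns to rows goes the opposite direction: a delimited string (or array) in one column is exploded into one row per element, using EXPLODE with LATERAL VIEW. A minimal sketch, assuming a hypothetical movie_info(movie, category) table where category is a comma-separated string:

-- One output row per movie/category pair
select movie, category_name
from movie_info
lateral view explode(split(category, ',')) tmp as category_name;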

Window Functions

Data preparation

jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94

Requirements

# Customers who made a purchase in April 2017, and the total number of such customers
# Each customer's purchase details plus the total purchase amount per month
# For the scenario above, a running total of each customer's cost by date
# Each customer's previous purchase date
# Orders that fall in the earliest 20% of the time range

Create the Hive table and load the data

create table business(
name string, 
orderdate string,
cost int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

load data local inpath "/opt/module/hive/datas/business.txt" into table business;

Query the data for each requirement

-- Customers who purchased in April 2017, and the total number of such customers
select name,count(*) over ()
from business
where substring(orderdate,1,7) = '2017-04'
group by name;

-- Each customer's purchase details plus the monthly total purchase amount
select name,orderdate,cost,sum(cost) 
over(partition by month(orderdate)) from
business;

-- Running total of each customer's cost by date
select name,orderdate,cost,
sum(cost) over() as sample1, -- sum over all rows
sum(cost) over(partition by name) as sample2, -- sum within each name group
sum(cost) over(partition by name order by orderdate) as sample3, -- running total within each name group
sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row) as sample4, -- same as sample3: from the start of the partition to the current row
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and current row) as sample5, -- the previous row and the current row
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING) as sample6, -- the previous row, the current row, and the next row
sum(cost) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING) as sample7 -- the current row and all following rows
from business;

-- Each customer's previous purchase date
select name,orderdate,cost,
lag(orderdate,1,'1900-01-01') over(partition by name order by orderdate) as time1,
lag(orderdate,2) over(partition by name order by orderdate) as time2
from business;

-- Orders in the earliest 20% of the time range
select * from (
    select name,orderdate,cost, ntile(5) over(order by orderdate) sorted
    from business
) t
where sorted = 1;

Rank

Data preparation

name subject score
孙悟空 语文 87
孙悟空 数学 95
孙悟空 英语 68
大海 语文 94
大海 数学 56
大海 英语 84
宋宋 语文 64
宋宋 数学 86
宋宋 英语 84
婷婷 语文 65
婷婷 数学 85
婷婷 英语 78

Requirements
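Rank the students' scores within each subject, comparing rank(), dense_rank(), and row_number() (see the query below).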

Create the Hive table and load the data

create table score(
name string,
subject string, 
score int) 
row format delimited fields terminated by "\t";
load data local inpath '/opt/module/hive/datas/score.txt' into table score;

Query the data for the requirement

select name,
subject,
score,
rank() over(partition by subject order by score desc) rp,
dense_rank() over(partition by subject order by score desc) drp,
row_number() over(partition by subject order by score desc) rmp
from score;
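In the result, rank() leaves gaps after ties (1, 1, 3), dense_rank() does not (1, 1, 2), and row_number() numbers the rows consecutively regardless of ties (1, 2, 3).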
