项目实战从0到1之hive(45)大数据项目之电商数仓(用三)
作者:互联网
第20章 需求九:每个用户累计访问次数
结果如下
用户 日期 小计 总计
mid1 2019-12-14 10 10
mid1 2019-02-11 12 22
mid2 2019-12-14 15 15
mid2 2019-02-11 12 27
20.1 DWS层
20.1.1 建表语句
drop table if exists dws_user_total_count_day;
create external table dws_user_total_count_day(
`mid_id` string COMMENT '设备id',
`subtotal` bigint COMMENT '每日登录小计'
)
partitioned by(`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dws/dws_user_total_count_day';
20.1.2 导入数据
1)导入数据
insert overwrite table dws_user_total_count_day
partition(dt='2019-12-14')
select
mid_id,
count(mid_id) cm
from
dwd_start_log
where
dt='2019-12-14'
group by
mid_id;
2)查询结果
select * from dws_user_total_count_day;
20.1.3 数据导入脚本
1)创建脚本dws_user_total_count_day.sh
[kgg@hadoop102 bin]$ vim dws_user_total_count_day.sh
在脚本中填写如下内容
#!/bin/bash
# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
insert overwrite table "$APP".dws_user_total_count_day partition(dt='$do_date')
select
mid_id,
count(mid_id) cm
from
"$APP".dwd_start_log
where
dt='$do_date'
group by
mid_id,dt;
"
$hive -e "$sql"
2)增加脚本执行权限
chmod 777 ads_user_total_count.sh
3)脚本使用
ads_user_total_count.sh 2019-02-20
4)查询结果
select * from ads_user_total_count;
5)脚本执行时间
企业开发中一般在每天凌晨30分~1点
20.2 ADS层
20.2.1 建表语句
drop table if exists ads_user_total_count;
create external table ads_user_total_count(
`mid_id` string COMMENT '设备id',
`subtotal` bigint COMMENT '每日登录小计',
`total` bigint COMMENT '登录次数总计'
)
partitioned by(`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_total_count';
20.2.2 导入数据
insert overwrite table ads_user_total_count partition(dt='2019-10-03')
select
if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
today.subtotal,
if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
from (
select
*
from dws_user_total_count_day
where dt='2019-10-03'
) today
full join (
select
*
from ads_user_total_count
where dt=date_add('2019-10-03', -1)
) yesterday
on today.mid_id=yesterday.mid_id
20.2.3 数据导入脚本
1)创建脚本
[kgg@hadoop102 bin]$ vim ads_user_total_count.sh
在脚本中编写如下内容
#!/bin/bash
db=gmall
hive=/opt/module/hive-1.2.1/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
if [[ -n $1 ]]; then
do_date=$1
else
do_date=`date -d '-1 day' +%F`
fi
sql="
use gmall;
insert overwrite table ads_user_total_count partition(dt='$do_date')
select
if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
today.subtotal,
if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
from (
select
*
from dws_user_total_count_day
where dt='$do_date'
) today
full join (
select
*
from ads_user_total_count
where dt=date_add('$do_date', -1)
) yesterday
on today.mid_id=yesterday.mid_id
"
$hive -e "$sql"
2)增加脚本执行权限
chmod 777 ads_user_total_count.sh
3)脚本使用
ads_user_total_count.sh 2019-02-20
4)查询结果
select * from ads_user_total_count;
5)脚本执行时间
企业开发中一般在每天凌晨30分~1点
第21章 需求十:新收藏用户数
新收藏用户:指的是在某天首次添加收藏的用户
21.1 DWS层建立用户日志行为宽表
考虑到后面的多个需求会同时用到多张表中的数据, 如果每次都join操作, 则影响查询的效率. 可以先提前做一张宽表, 提高其他查询的执行效率.
每个用户对每个商品的点击次数, 点赞次数, 收藏次数
21.1.1 建表语句
drop table if exists dws_user_action_wide_log;
CREATE EXTERNAL TABLE dws_user_action_wide_log(
`mid_id` string COMMENT '设备id',
`goodsid` string COMMENT '商品id',
`display_count` string COMMENT '点击次数',
`praise_count` string COMMENT '点赞次数',
`favorite_count` string COMMENT '收藏次数')
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_wide_log/'
TBLPROPERTIES('parquet.compression'='lzo');
21.1.2 导入数据
insert overwrite table dws_user_action_wide_log partition(dt='2019-12-14')
select
mid_id,
goodsid,
sum(display_count) display_count,
sum(praise_count) praise_count,
sum(favorite_count) favorite_count
from
( select
mid_id,
goodsid,
count(*) display_count,
0 praise_count,
0 favorite_count
from
dwd_display_log
where
dt='2019-12-14' and action=2
group by
mid_id,goodsid
union all
select
mid_id,
target_id goodsid,
0,
count(*) praise_count,
0
from
dwd_praise_log
where
dt='2019-12-14'
group by
mid_id,target_id
union all
select
mid_id,
course_id goodsid,
0,
0,
count(*) favorite_count
from
dwd_favorites_log
where
dt='2019-12-14'
group by
mid_id,course_id
)user_action
group by
mid_id,goodsid;
21.1.3 数据导入脚本
[kgg@hadoop102 bin]$ vi dws_user_action_wide_log.sh
[kgg@hadoop102 bin]$ chmod 777 dws_user_action_wide_log.sh
#!/bin/bash
db=gmall
hive=/opt/module/hive-1.2.1/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
if [[ -n $1 ]]; then
do_date=$1
else
do_date=`date -d '-1 day' +%F`
fi
sql="
use gmall;
insert overwrite table dws_user_action_wide_log partition(dt='$do_date')
select
mid_id,
goodsid,
sum(display_count) display_count,
sum(praise_count) praise_count,
sum(favorite_count) favorite_count
from
( select
mid_id,
goodsid,
count(*) display_count,
0 praise_count,
0 favorite_count
from
dwd_display_log
where
dt='$do_date' and action=2
group by
mid_id,goodsid
union all
select
mid_id,
target_id goodsid,
0,
count(*) praise_count,
0
from
dwd_praise_log
where
dt='$do_date'
group by
mid_id,target_id
union all
select
mid_id,
course_id goodsid,
0,
0,
count(*) favorite_count
from
dwd_favorites_log
where
dt='$do_date'
group by
mid_id,course_id
)user_action
group by
mid_id,goodsid;
"
$hive -e "$sql"
21.2 DWS层
使用日志数据用户行为宽表作为DWS层表
21.3 ADS层
21.3.1 建表语句
drop table if exists ads_new_favorites_mid_day;
create external table ads_new_favorites_mid_day(
`dt` string COMMENT '日期',
`favorites_users` bigint COMMENT '新收藏用户数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_favorites_mid_day';
21.3.2 导入数据
insert into table ads_new_favorites_mid_day
select
'2019-12-14' dt,
count(*) favorites_users
from
(
select
mid_id
from
dws_user_action_wide_log
where
favorite_count>0
group by
mid_id
having
min(dt)='2019-12-14'
)user_favorite;
21.3.3 数据导入脚本
1)创建脚本ads_new_favorites_mid_day.sh
[kgg@hadoop102 bin]$ vim ads_new_favorites_mid_day.sh
在脚本中填写如下内容
#!/bin/bash
# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
insert into table "$APP".ads_new_favorites_mid_day
select
'$do_date' dt,
count(*) favorites_users
from
(
select
mid_id
from
"$APP".dws_user_action_wide_log
where
favorite_count>0
group by
mid_id
having
min(dt)='$do_date'
)user_favorite;
"
$hive -e "$sql"
2)增加脚本执行权限
chmod 777 ads_new_favorites_mid_day.sh
3)脚本使用
ads_new_favorites_mid_day.sh 2019-02-20
4)查询结果
select * from ads_new_favorites_mid_day;
5)脚本执行时间
企业开发中一般在每天凌晨30分~1点
第22章 需求十一:各个商品点击次数top3的用户
22.1 DWS层
使用日志数据用户行为宽表作为DWS层表
22.2 ADS层
22.2.1 建表语句
drop table if exists ads_goods_count;
create external table ads_goods_count(
`dt` string COMMENT '统计日期',
`goodsid` string COMMENT '商品',
`user_id` string COMMENT '用户',
`goodsid_user_count` bigint COMMENT '商品用户点击次数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_count';
22.2.2 导入数据
insert into table ads_goods_count
select
'2019-10-03',
goodsid,
mid_id,
sum_display_count
from(
select
goodsid,
mid_id,
sum_display_count,
row_number() over(partition by goodsid order by sum_display_count desc) rk
from(
select
goodsid,
mid_id,
sum(display_count) sum_display_count
from dws_user_action_wide_log
where display_count>0
group by goodsid, mid_id
) t1
) t2
where rk <= 3
22.2.3 数据导入脚本
1)创建脚本ads_goods_count.sh
[kgg@hadoop102 bin]$ vim ads_goods_count.sh
在脚本中填写如下内容
#!/bin/bash
db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop/bin/hadoop
if [[ -n $1 ]]; then
do_date=$1
else
do_date=`date -d '-1 day' +%F`
fi
sql="
use gmall;
insert into table ads_goods_count
select
'$do_date',
goodsid,
mid_id,
sum_display_count
from(
select
goodsid,
mid_id,
sum_display_count,
row_number() over(partition by goodsid order by sum_display_count desc) rk
from(
select
goodsid,
mid_id,
sum(display_count) sum_display_count
from dws_user_action_wide_log
where display_count>0
group by goodsid, mid_id
) t1
) t2
where rk <= 3
"
$hive -e "$sql"
2)增加脚本执行权限
chmod 777 ads_goods_count.sh
3)脚本使用
ads_goods_count.sh 2019-02-20
4)查询结果
select * from ads_goods_count;
5)脚本执行时间
企业开发中一般在每天凌晨30分~1点
标签:商数,count,ads,45,mid,hive,user,date,id 来源: https://www.cnblogs.com/huanghanyu/p/14308745.html