其他分享
首页 > 其他分享 > 02数仓开发-04数仓多维模型构建

02数仓开发-04数仓多维模型构建

作者:互联网

目录

一.背景

数仓建设中经常会有多个维度灵活组合看数的需求,这种多维分析的场景一般有两种处理方式

二.维度爆炸&下游易用

“维度爆炸”指的是每增加一个维度,由于维度组合数翻倍,Cube的计算和存储量也会成倍增长。

作为典型的预计算 MOLAP 框架,Kylin 是如何解决维度爆炸问题的呢?

类似的,在Spark中可以对所有维度组合都进行预计算( with cube 子句),

或者出于业务或者计算存储成本考虑只对一部分维度组合进行预计算(grouping sets 子句)。

grouping sets 方式剪枝后一定程度上可以避免维度爆炸,但还有以下痛点:

三.如何优化

示例:求各个维度组合的人数

维度组合: country、city、xian、zhen、is_good

    -- Sample fact row used by every example below: one location
    -- (country / city / xian / zhen), its is_good flag, and the head count p_num.
    select
        '中国'    as country
        ,'北京'   as city
        ,'昌平'   as xian
        ,'西北旺' as zhen
        ,1        as is_good
        ,1000     as p_num

1.grouping sets 子句

8种维度组合

-- Head count (sum of p_num) for 8 hand-picked dimension combinations.
-- country is part of every grouping set; city / xian / zhen / is_good are optional.
-- A dimension that is rolled up in a given grouping set is reported as -1000.
--
-- The marker is derived from grouping() rather than nvl(): nvl() also rewrites a
-- genuine NULL coming from the data, which makes such a row indistinguishable from
-- a subtotal row. With grouping(), real NULLs stay NULL and only rolled-up
-- dimensions receive the marker.
-- For the string dimensions the marker is written as a string literal so both CASE
-- branches share one type and nothing depends on implicit string/int coercion.
-- sum(p_num) is intentionally left unaliased so the engine-generated column name
-- is unchanged.
select
    case when grouping(country) = 1 then '-1000' else country end as country
    ,case when grouping(is_good) = 1 then -1000 else is_good end  as is_good
    ,case when grouping(xian) = 1 then '-1000' else xian end      as xian
    ,case when grouping(city) = 1 then '-1000' else city end      as city
    ,case when grouping(zhen) = 1 then '-1000' else zhen end      as zhen
    ,sum(p_num)
from
(
    select '中国' country , '北京' city , '昌平' xian ,'西北旺' zhen,1 is_good , 1000 p_num
    -- optional second sample row:
    -- union all
    -- select '中国' country , '河南' city , '光山' xian ,'十里' zhen,1 is_good , 50 p_num
) t
group by country,city,xian,zhen,is_good
grouping sets (
    -- combinations without is_good
    (country)
    ,(country,city)
    ,(country,xian)
    ,(country,xian,zhen)
    -- the same four combinations, additionally split by is_good
    ,(country,is_good)
    ,(country,city,is_good)
    ,(country,xian,is_good)
    ,(country,xian,zhen,is_good)
)

运行结果:8条

country is_good xian city zhen _c5
中国 1 昌平 -1000 -1000 1000
中国 -1000 -1000 -1000 -1000 1000
中国 1 -1000 -1000 -1000 1000
中国 1 昌平 -1000 西北旺 1000
中国 -1000 -1000 北京 -1000 1000
中国 -1000 昌平 -1000 -1000 1000
中国 1 -1000 北京 -1000 1000
中国 -1000 昌平 -1000 西北旺 1000

2.lateral view + 自定义维度list

8种维度组合

通过lateral view 维度list 对数据做笛卡尔积,每条数据膨胀成8条维度组合不同的数据。

-- Head count (sum of p_num) per dimension combination, built by row expansion
-- instead of grouping sets. Each input row is multiplied by
--   (a) the two is_good variants (actual value, and -1000 for "rolled up"), and
--   (b) the four entries of the hard-coded dims list,
-- i.e. 2 x 4 = 8 rows per input row, which are then aggregated.
select
    dims
    ,country
    ,is_good
    ,xian
    ,city
    ,zhen
    ,sum(p_num)
from
(
    select
        dims
        ,country -- mandatory dimension, kept in every combination
        ,is_good
        -- Keep a dimension only when its label is listed in dims; otherwise emit the
        -- rolled-up marker. The marker is a string literal so both IF branches are
        -- strings and the result does not depend on implicit string/int coercion.
        ,if(find_in_set('县城', dims) > 0, xian, '-10000') as xian
        ,if(find_in_set('城市', dims) > 0, city, '-10000') as city
        ,if(find_in_set('城镇', dims) > 0, zhen, '-10000') as zhen
        ,p_num
    from
    (   select
            array(is_good,-1000) as is_good_list -- actual value plus the rolled-up marker
            ,country
            ,city
            ,xian
            ,zhen
            ,p_num
        from
        (
            select  '中国' country , '北京' city , '昌平' xian ,'西北旺' zhen,1 is_good , 1000 p_num
            -- optional second sample row:
            -- union all
            -- select '中国' country , '河南' city , '光山' xian ,'十里' zhen,1 is_good , 50 p_num
        ) src
    ) base
    -- The lateral view aliases are distinct from the derived-table aliases above,
    -- so no name is bound twice in the same FROM clause.
    lateral view  explode(is_good_list) is_good_view as is_good
    lateral view  explode(array( '国家'
                                ,'国家,县城'
                                ,'国家,城市'
                                ,'国家,城市,城镇'
    )) dims_view as dims
) expanded
group by dims,country,is_good,xian,city,zhen

运行结果:8条

dims country is_good xian city zhen SUM(p_num)
国家 中国 1 -10000 -10000 -10000 1000
国家 中国 -1000 -10000 -10000 -10000 1000
国家,县城 中国 1 昌平 -10000 -10000 1000
国家,县城 中国 -1000 昌平 -10000 -10000 1000
国家,城市 中国 1 -10000 北京 -10000 1000
国家,城市 中国 -1000 -10000 北京 -10000 1000
国家,城市,城镇 中国 1 -10000 北京 西北旺 1000
国家,城市,城镇 中国 -1000 -10000 北京 西北旺 1000

3.通过配置文件,维护维度list

配置表:conf_table

-- Dimension-combination configuration (conf_table), inlined here as literal rows.
-- Each row is one combination to compute: dim1..dim4 carry the dimension labels,
-- unused slots are empty strings; `version` and dt identify the configuration set.
with conf_table as (
    select 'v1' as `version`, '国家' as dim1, ''     as dim2, ''     as dim3, '' as dim4, '2022-07-05' as dt
    union all
    select 'v1' as `version`, '国家' as dim1, '县城' as dim2, ''     as dim3, '' as dim4, '2022-07-05' as dt
    union all
    select 'v1' as `version`, '国家' as dim1, '城市' as dim2, ''     as dim3, '' as dim4, '2022-07-05' as dt
    union all
    select 'v1' as `version`, '国家' as dim1, '城市' as dim2, '城镇' as dim3, '' as dim4, '2022-07-05' as dt
)
select
    `version`,
    dim1,
    dim2,
    dim3,
    dim4,
    dt
from conf_table

sql

-- Head count (sum of p_num) per dimension combination, using the same row
-- expansion as the lateral view approach, except that the list of combinations
-- is read from the configuration table instead of being hard-coded in the query.
select
    dims
    ,country
    ,is_good
    ,xian
    ,city
    ,zhen
    ,sum(p_num)
from
(
    select
        dims
        ,country -- mandatory dimension, kept in every combination
        ,is_good
        -- Keep a dimension only when its label is listed in dims; otherwise emit the
        -- rolled-up marker. The marker is a string literal so both IF branches are
        -- strings and the result does not depend on implicit string/int coercion.
        ,if(find_in_set('县城', dims) > 0, xian, '-10000') as xian
        ,if(find_in_set('城市', dims) > 0, city, '-10000') as city
        ,if(find_in_set('城镇', dims) > 0, zhen, '-10000') as zhen
        ,p_num
    from
    (   select
            array(is_good,-1000) as is_good_list -- actual value plus the rolled-up marker
            ,country
            ,city
            ,xian
            ,zhen
            ,p_num
        from
        (
            select  '中国' country , '北京' city , '昌平' xian ,'西北旺' zhen,1 is_good , 1000 p_num
        ) src
    ) base
    -- Configuration table: every data row is paired with every configured
    -- combination. The Cartesian product is written as an explicit CROSS JOIN
    -- rather than "left join ... on 1 = 1": the intent is visible, engines that
    -- reject implicit Cartesian products accept it, and an empty configuration
    -- yields no rows instead of rows with a NULL dims value.
    cross join (
                select
                    -- Unused slots are '' so dims keeps its empty positions
                    -- (e.g. '国家,县城,,'); find_in_set still matches the labels.
                    concat_ws(',', array(dim1, dim2, dim3, dim4)) as dims
                from (
                    select 'v1' as `version`,'国家' as dim1,'' as dim2 , '' dim3, '' dim4, '2022-07-05' as dt
                    union all
                    select 'v1' as `version`,'国家' as dim1,'县城' as dim2 , '' dim3, '' dim4, '2022-07-05' as dt
                    union all
                    select 'v1' as `version`,'国家' as dim1,'城市' as dim2 , '' dim3, '' dim4, '2022-07-05' as dt
                    union all
                    select 'v1' as `version`,'国家' as dim1,'城市' as dim2 , '城镇' dim3, '' dim4, '2022-07-05' as dt
                ) conf_rows
                where `version` = 'v1' and dt = '2022-07-05'
    ) conf_table
    lateral view  explode(is_good_list) is_good_view as is_good
) expanded
group by dims,country,is_good,xian,city,zhen
;
运行结果:8条

dims country is_good xian city zhen sum(p_num)
国家,县城,, 中国 -1000 昌平 -10000 -10000 1000
国家,县城,, 中国 1 昌平 -10000 -10000 1000
国家,城市,城镇, 中国 -1000 -10000 北京 西北旺 1000
国家,城市,, 中国 1 -10000 北京 -10000 1000
国家,,, 中国 -1000 -10000 -10000 -10000 1000
国家,城市,, 中国 -1000 -10000 北京 -10000 1000
国家,,, 中国 1 -10000 -10000 -10000 1000
国家,城市,城镇, 中国 1 -10000 北京 西北旺 1000

标签:02,数仓,good,10000,04,country,xian,维度,1000
来源: https://www.cnblogs.com/wh984763176/p/16450257.html