数据分析--基础指标多维统计
作者:互联网
1.1. 多维度统计PV总量
--第一种方式:直接在dw_weblog_detail单表上进行查询
1.1.1 计算该处理批次(一天)中的各小时pvs
-- Rebuild the hourly PV table for one processing batch.
drop table dw_pvs_everyhour_oneday;

create table dw_pvs_everyhour_oneday (
    month string,
    day   string,
    hour  string,
    pvs   bigint
)
partitioned by (datestr string);

-- Count detail rows per (month, day, hour) within the 20181101 partition and
-- store them under the same partition value in the target table.
insert into table dw_pvs_everyhour_oneday partition (datestr = '20181101')
select
    month,
    day,
    hour,
    count(*) as pvs
from dw_weblog_detail
where datestr = '20181101'
group by
    month,
    day,
    hour;

-- Inspect what was loaded.
select * from dw_pvs_everyhour_oneday;
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
| dw_pvs_everyhour_oneday.month | dw_pvs_everyhour_oneday.day | dw_pvs_everyhour_oneday.hour | dw_pvs_everyhour_oneday.pvs | dw_pvs_everyhour_oneday.datestr |
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
| 11 | 01 | 06 | 222 | 20181101 |
| 11 | 01 | 07 | 2020 | 20181101 |
| 11 | 01 | 08 | 4104 | 20181101 |
| 11 | 01 | 09 | 2748 | 20181101 |
| 11 | 01 | 10 | 1136 | 20181101 |
| 11 | 01 | 11 | 1142 | 20181101 |
| 11 | 01 | 12 | 1242 | 20181101 |
| 11 | 01 | 13 | 1062 | 20181101 |
| 11 | 01 | 14 | 1028 | 20181101 |
| 11 | 01 | 15 | 1518 | 20181101 |
| 11 | 01 | 16 | 950 | 20181101 |
| 11 | 01 | 17 | 764 | 20181101 |
| 11 | 01 | 18 | 524 | 20181101 |
| 11 | 01 | 19 | 780 | 20181101 |
| 11 | 01 | 20 | 422 | 20181101 |
| 11 | 01 | 21 | 426 | 20181101 |
| 11 | 01 | 22 | 702 | 20181101 |
| 11 | 01 | 23 | 764 | 20181101 |
| 11 | 02 | 00 | 624 | 20181101 |
| 11 | 02 | 01 | 648 | 20181101 |
| 11 | 02 | 02 | 1092 | 20181101 |
| 11 | 02 | 03 | 1104 | 20181101 |
| 11 | 02 | 04 | 1138 | 20181101 |
| 11 | 02 | 05 | 1080 | 20181101 |
| 11 | 02 | 06 | 300 | 20181101 |
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
--计算每天的pvs
-- Rebuild the daily PV table (not partitioned).
drop table dw_pvs_everyday;

create table dw_pvs_everyday (
    pvs   bigint,
    month string,
    day   string
);

-- One row per (month, day); the select list follows the table's column order.
-- There is no partition filter, so every row of dw_weblog_detail is counted.
insert into table dw_pvs_everyday
select
    count(*) as pvs,
    a.month  as month,
    a.day    as day
from dw_weblog_detail a
group by
    a.month,
    a.day;

-- Inspect what was loaded.
select * from dw_pvs_everyday;
+--------+--------+------+--+
| aspvs | month | day |
+--------+--------+------+--+
| 21554 | 11 | 01 |
| 5986 | 11 | 02 |
+--------+--------+------+--+
-- Attempt to remove the rows for day 01 and day 02.
-- The predicate uses IN: a single row can never satisfy day='01' AND day='02',
-- so the AND form would match nothing even where DELETE is supported.
--
-- NOTE: in this environment Hive rejects the statement (update/delete are not
-- supported by the transaction manager in use). It is kept to show the error:
--   Error: Error while compiling statement: FAILED: SemanticException [Error 10294]:
--   Attempt to do update or delete using transaction manager that does not support these operations. (state=42000,code=10294)
delete from dw_pvs_everyday where day in ('01', '02');
--维度:月
-- Rebuild the monthly PV table.
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
drop table if exists dw_pvs_everymonth;

create table dw_pvs_everymonth (
    pvs   bigint,
    month string
);

-- Describe only after the table exists: between DROP and CREATE there is
-- nothing to describe and the statement fails.
desc dw_pvs_everymonth;
--所需技术:
--常用的join有四种,inner join, left outer join, right outer join, full outer join
INNER JOIN
跟JOIN是一样的,一般INNER关键字可以省略。
INNER JOIN只返回两张表中满足连接条件的匹配行,不会返回因缺少匹配而用NULL填充的行。
left join
--左连接,返回左表中的所有记录,即使在右表中没有匹配的行。
right outer join
--右连接,返回右表中的所有记录,即使在左表中没有记录与它匹配
full outer join
--全连接,返回左右表中的所有记录
-- Load: monthly PV totals.
-- The distinct months from t_dim_time are inner-joined to the detail rows on
-- month, and the joined rows are counted per month.
insert into table dw_pvs_everymonth
select
    count(*) as pvs,
    a.month  as month
from (
    select distinct month
    from t_dim_time
) a
inner join dw_weblog_detail b
    on a.month = b.month
group by
    a.month;

-- Check: inspect what was loaded.
select * from dw_pvs_everymonth;
+------------------------+--------------------------+--+
| dw_pvs_everymonth.pvs | dw_pvs_everymonth.month |
+------------------------+--------------------------+--+
| 27540 | 11 |
+------------------------+--------------------------+--+
--(1.2)统计每小时各来访url产生的pv量,查询结果存入:( "dw_pvs_referer_everyhour" )
-- Rebuild the per-hour referer-URL PV table, partitioned by batch date (datestr).
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
drop table if exists dw_pvs_referer_everyhour;

create table dw_pvs_referer_everyhour (
    referer_url    string,
    referer_host   string,
    month          string,
    day            string,
    hour           string,
    pv_referer_cnt bigint
)
partitioned by (datestr string);

-- Describe only after the table exists: between DROP and CREATE there is
-- nothing to describe and the statement fails. Captured output follows.
desc dw_pvs_referer_everyhour;
+--------------------------+-----------------------+-----------------------+--+
| col_name | data_type | comment |
+--------------------------+-----------------------+-----------------------+--+
| referer_url | string | |
| referer_host | string | |
| month | string | |
| day | string | |
| hour | string | |
| pv_referer_cnt | bigint | |
| datestr | string | |
| | NULL | NULL |
| # Partition Information | NULL | NULL |
| # col_name | data_type | comment |
| | NULL | NULL |
| datestr | string | |
+--------------------------+-----------------------+-----------------------+--+
dw_weblog_detail 表的示例数据(前10行):
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+------------------------------------------------+----------------------------+-------------------------------+-----------------------------+--------------------------------+----------------------------------------------------+---------------------------+--+
| dw_weblog_detail.valid | dw_weblog_detail.remote_addr | dw_weblog_detail.remote_user | dw_weblog_detail.time_local | dw_weblog_detail.daystr | dw_weblog_detail.timestr | dw_weblog_detail.month | dw_weblog_detail.day | dw_weblog_detail.hour | dw_weblog_detail.request | dw_weblog_detail.status | dw_weblog_detail.body_bytes_sent | dw_weblog_detail.http_referer | dw_weblog_detail.ref_host | dw_weblog_detail.ref_path | dw_weblog_detail.ref_query | dw_weblog_detail.ref_query_id | dw_weblog_detail.http_user_agent | dw_weblog_detail.datestr |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+------------------------------------------------+----------------------------+-------------------------------+-----------------------------+--------------------------------+----------------------------------------------------+---------------------------+--+
| false | 194.237.142.21 | - | 2018-11-01 06:49:18 | 2018-11-01 | 06:49:18 | 11 | 01 | 06 | /wp-content/uploads/2013/07/rstudio-git3.png | 304 | 0 | "-" | NULL | NULL | NULL | NULL | "Mozilla/4.0(compatible;)" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:33 | 2018-11-01 | 06:49:33 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:36 | 2018-11-01 | 06:49:36 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:42 | 2018-11-01 | 06:49:42 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:45 | 2018-11-01 | 06:49:45 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 60.208.6.156 | - | 2018-11-01 06:49:48 | 2018-11-01 | 06:49:48 | 11 | 01 | 06 | /wp-content/uploads/2013/07/rcassandra.png | 200 | 185524 | "http://cos.name/category/software/packages/" | cos.name | /category/software/packages/ | NULL | NULL | "Mozilla/5.0(WindowsNT6.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/29.0.1547.66Safari/537.36" | 20181101 |
| false | 222.68.172.190 | - | 2018-11-01 06:49:57 | 2018-11-01 | 06:49:57 | 11 | 01 | 06 | /images/my.jpg | 200 | 19939 | "http://www.angularjs.cn/A00n" | www.angularjs.cn | /A00n | NULL | NULL | "Mozilla/5.0(WindowsNT6.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/29.0.1547.66Safari/537.36" | 20181101 |
| false | 183.195.232.138 | - | 2018-11-01 06:50:16 | 2018-11-01 | 06:50:16 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 183.195.232.138 | - | 2018-11-01 06:50:16 | 2018-11-01 | 06:50:16 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 66.249.66.84 | - | 2018-11-01 06:50:28 | 2018-11-01 | 06:50:28 | 11 | 01 | 06 | /page/6/ | 200 | 27777 | "-" | NULL | NULL | NULL | NULL | "Mozilla/5.0(compatible;Googlebot/2.1;+http://www.google.com/bot.html)" | 20181101 |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+------------------------------------------------+----------------------------+-------------------------------+-----------------------------+--------------------------------+----------------------------------------------------+---------------------------+--+
--统计每小时各来访url产生的pv量,查询结果存入:( "dw_pvs_referer_everyhour" )
-- Preview only: first 10 (referer URL, referer host, hour) groups ordered by
-- month, day, hour and ascending PV count.
-- Deliberately NOT an INSERT: the complete result is loaded into
-- dw_pvs_referer_everyhour by the insert statement further below, so writing
-- this 10-row sample as well would store those rows twice.
select
    http_referer,
    ref_host,
    month,
    day,
    hour,
    count(1) as pv_referer_cnt
from dw_weblog_detail
where datestr = '20181101'       -- read only the batch being processed
  and ref_host is not null       -- row-level filter: applied before grouping
group by
    http_referer,
    ref_host,
    month,
    day,
    hour
order by
    month,
    day,
    hour,
    pv_referer_cnt
limit 10;
+----------------------------------------------------+-------------------+--------+------+-------+-----------------+--+
| http_referer | ref_host | month | day | hour | pv_referer_cnt |
+----------------------------------------------------+-------------------+--------+------+-------+-----------------+--+
| "http://blog.fens.me/series-nodejs/" | blog.fens.me | 11 | 01 | 06 | 2 |
| "http://www.angularjs.cn/" | www.angularjs.cn | 11 | 01 | 06 | 2 |
| "http://www.google.com/url?sa=t&rct=j&q=nodejs%20%E5%BC%82%E6%AD%A5%E5%B9%BF%E6%92%AD&source=web&cd=1&cad=rja&ved=0CCgQFjAA&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%6e%6f%64%65%6a%73%2d%73%6f%63%6b%65%74%69%6f%2d%63%68%61%74%2f&ei=rko5UrylAefOiAe7_IGQBw&usg=AFQjCNG6YWoZsJ_bSj8kTnMHcH51hYQkAA&bvm=bv.52288139,d.aGc" | www.google.com | 11 | 01 | 06 | 2 |
| "http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=6&cad=rja&ved=0CHIQFjAF&url=http%3A%2F%2Fblog.fens.me%2Fvps-ip-dns%2F&ei=j045UrP5AYX22AXsg4G4DQ&usg=AFQjCNGsJfLMNZnwWXNpTSUl6SOEzfF6tg&sig2=YY1oxEybUL7wx3IrVIMfHA&bvm=bv.52288139,d.b2I" | www.google.com | 11 | 01 | 06 | 2 |
| "http://cos.name/category/software/packages/" | cos.name | 11 | 01 | 06 | 2 |
| "http://www.baidu.com/s?tn=baiduhome_pg&ie=utf-8&bs=%E5%9C%A8linux%E5%90%AF%E5%8A%A8%E4%B8%8Bmongodb.conf&f=8&rsv_bp=1&wd=about+to+fork+child+process%2C+waiting+until+server+is+ready+for+connections.&rsv_n=2&rsv_sug3=1&rsv_sug1=1&rsv_sug4=187&inputT=906" | www.baidu.com | 11 | 01 | 06 | 2 |
| "http://www.angularjs.cn/A00n" | www.angularjs.cn | 11 | 01 | 06 | 4 |
| "http://blog.fens.me/nodejs-express3/" | blog.fens.me | 11 | 01 | 06 | 4 |
| "http://blog.fens.me/nodejs-async/" | blog.fens.me | 11 | 01 | 06 | 10 |
| "http://blog.fens.me/wp-content/themes/silesia/style.css" | blog.fens.me | 11 | 01 | 06 | 14 |
+----------------------------------------------------+-------------------+--------+------+-------+-----------------+--+
-- Load the per-hour PV count of every referer URL into the 20181101 partition.
insert into table dw_pvs_referer_everyhour partition (datestr = '20181101')
select
    http_referer,
    ref_host,
    month,
    day,
    hour,
    count(1) as pv_referer_cnt
from dw_weblog_detail
-- Read only the batch that is being written; without this filter rows from any
-- other datestr partition of the source would land in the 20181101 partition.
where datestr = '20181101'
  -- ref_host is a plain row attribute (and a grouping key), so the null filter
  -- belongs in WHERE, where it is applied before aggregation, not in HAVING.
  and ref_host is not null
group by
    http_referer,
    ref_host,
    month,
    day,
    hour
order by
    month,
    day,
    hour,
    pv_referer_cnt;

-- Spot-check the loaded rows.
select * from dw_pvs_referer_everyhour limit 5;
+----------------------------------------------------+----------------------------------------+---------------------------------+-------------------------------+--------------------------------+------------------------------------------+-----------------------------------+--+
| dw_pvs_referer_everyhour.referer_url | dw_pvs_referer_everyhour.referer_host | dw_pvs_referer_everyhour.month | dw_pvs_referer_everyhour.day | dw_pvs_referer_everyhour.hour | dw_pvs_referer_everyhour.pv_referer_cnt | dw_pvs_referer_everyhour.datestr |
+----------------------------------------------------+----------------------------------------+---------------------------------+-------------------------------+--------------------------------+------------------------------------------+-----------------------------------+--+
| "http://blog.fens.me/series-nodejs/" | blog.fens.me | 11 | 01 | 06 | 2 | 20181101 |
| "http://www.angularjs.cn/" | www.angularjs.cn | 11 | 01 | 06 | 2 | 20181101 |
| "http://www.google.com/url?sa=t&rct=j&q=nodejs%20%E5%BC%82%E6%AD%A5%E5%B9%BF%E6%92%AD&source=web&cd=1&cad=rja&ved=0CCgQFjAA&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%6e%6f%64%65%6a%73%2d%73%6f%63%6b%65%74%69%6f%2d%63%68%61%74%2f&ei=rko5UrylAefOiAe7_IGQBw&usg=AFQjCNG6YWoZsJ_bSj8kTnMHcH51hYQkAA&bvm=bv.52288139,d.aGc" | www.google.com | 11 | 01 | 06 | 2 | 20181101 |
| "http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=6&cad=rja&ved=0CHIQFjAF&url=http%3A%2F%2Fblog.fens.me%2Fvps-ip-dns%2F&ei=j045UrP5AYX22AXsg4G4DQ&usg=AFQjCNGsJfLMNZnwWXNpTSUl6SOEzfF6tg&sig2=YY1oxEybUL7wx3IrVIMfHA&bvm=bv.52288139,d.b2I" | www.google.com | 11 | 01 | 06 | 2 | 20181101 |
| "http://cos.name/category/software/packages/" | cos.name | 11 | 01 | 06 | 2 | 20181101 |
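+----------------------------------------------------+----------------------------------------+---------------------------------+-------------------------------+--------------------------------+------------------------------------------+-----------------------------------+--+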
--统计每小时各来访host的产生的pv数并排序
-- Rebuild the per-hour referer-host PV table, partitioned by batch date (datestr).
drop table dw_pvs_refererhost_everyhour;

create table dw_pvs_refererhost_everyhour (
    ref_host      string,
    month         string,
    day           string,
    hour          string,
    ref_host_cnts bigint
)
partitioned by (datestr string);

-- Show the resulting schema.
desc dw_pvs_refererhost_everyhour;
+--------------------------+-----------------------+-----------------------+--+
| col_name | data_type | comment |
+--------------------------+-----------------------+-----------------------+--+
| ref_host | string | |
| month | string | |
| day | string | |
| hour | string | |
| ref_host_cnts | bigint | |
| datestr | string | |
| | NULL | NULL |
| # Partition Information | NULL | NULL |
| # col_name | data_type | comment |
| | NULL | NULL |
| datestr | string | |
+--------------------------+-----------------------+-----------------------+--+
-- Preview: PV count per referer host and hour, first 10 rows.
-- Sort keys are hour, then day, then month (ascending), then count descending,
-- so rows for the same hour-of-day come first regardless of the day.
select
    ref_host,
    month,
    day,
    hour,
    count(1) as ref_host_cnts
from dw_weblog_detail
where datestr = '20181101'       -- read only the batch being processed
  and ref_host is not null       -- row-level filter: applied before grouping
group by
    ref_host,
    month,
    day,
    hour
order by
    hour asc,
    day asc,
    month asc,
    ref_host_cnts desc
limit 10;
+---------------------+--------+------+-------+----------------+--+
| ref_host | month | day | hour | ref_host_cnts |
+---------------------+--------+------+-------+----------------+--+
| blog.fens.me | 11 | 02 | 00 | 222 |
| www.fens.me | 11 | 02 | 00 | 26 |
| h2w.iask.cn | 11 | 02 | 00 | 12 |
| www.google.com.hk | 11 | 02 | 00 | 6 |
| angularjs.cn | 11 | 02 | 00 | 6 |
| cnodejs.org | 11 | 02 | 00 | 2 |
| www.leonarding.com | 11 | 02 | 00 | 2 |
| www.itpub.net | 11 | 02 | 00 | 2 |
| blog.fens.me | 11 | 02 | 01 | 178 |
| cos.name | 11 | 02 | 01 | 6 |
+---------------------+--------+------+-------+----------------+--+
-- Load the per-hour PV count of every referer host into the 20181101 partition.
insert into table dw_pvs_refererhost_everyhour partition (datestr = '20181101')
select
    ref_host,
    month,
    day,
    hour,
    count(1) as ref_host_cnts
from dw_weblog_detail
-- Read only the batch that is being written; without this filter rows from any
-- other datestr partition of the source would land in the 20181101 partition.
where datestr = '20181101'
  -- ref_host is a plain row attribute (and a grouping key), so the null filter
  -- belongs in WHERE, where it is applied before aggregation, not in HAVING.
  and ref_host is not null
group by
    ref_host,
    month,
    day,
    hour
order by
    hour asc,
    day asc,
    month asc,
    ref_host_cnts desc;

-- Spot-check the loaded rows.
select * from dw_pvs_refererhost_everyhour limit 5;
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| dw_pvs_refererhost_everyhour.ref_host | dw_pvs_refererhost_everyhour.month | dw_pvs_refererhost_everyhour.day | dw_pvs_refererhost_everyhour.hour | dw_pvs_refererhost_everyhour.ref_host_cnts | dw_pvs_refererhost_everyhour.datestr |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| blog.fens.me | 11 | 02 | 00 | 222 | 20181101 |
| www.fens.me | 11 | 02 | 00 | 26 | 20181101 |
| h2w.iask.cn | 11 | 02 | 00 | 12 | 20181101 |
| www.google.com.hk | 11 | 02 | 00 | 6 | 20181101 |
| angularjs.cn | 11 | 02 | 00 | 6 | 20181101 |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
标签:数据分析,11,pvs,01,20181101,指标,多维,dw,NULL 来源: https://blog.csdn.net/longyanchen/article/details/98865467