sql-面试题

2022-08-19 22:02:28 作者：互联网

TopN问题

需要确定使用什么排名函数，包含三种函数：row_number()、rank()、dense_rank()

每个班级的分数为前3名的学生

--建表语句
create table score(sid string, class string, score int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table score;

--数据：学生id、班级、分数
s1 A 89
s2 C 88
s3 A 92
s4 A 89
s5 B 90
s6 B 86
s7 C 92
s8 C 90
s9 A 85
s10 C 86
s11 B 86
s12 B 86

--sql语句
select * from (
    select class, sid, score, dense_rank() over(partition by class order by score desc) as rank
    from score
) t1
where t1.rank < 4;

--结果
t1.class	t1.sid	t1.score	t1.rank
A	s3	92	1
A	s4	89	2
A	s1	89	2
A	s9	85	3
B	s5	90	1
B	s11	86	2
B	s12	86	2
B	s6	86	2
C	s7	92	1
C	s8	90	2
C	s2	88	3

行列转换问题

（1）一行转换为多行

将每个电影的分类列表拆分出单个分类

--建表语句
create table movie(movie_name string, category string)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table movie;

--数据：电影名、分类
让子弹飞 动作、年代
长江七号 科幻
大进军 战争

--sql语句
select movie_name, category_name
from movie
lateral view explode(split(category, '、')) t1 as category_name;

--结果
movie_name	category_name
让子弹飞	动作
让子弹飞	年代
长江七号	科幻
大进军	战争

（2）多行转换为一行

将年龄相同的姓名合并在一起

--建表语句
create table person(name string, age int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table person;

--数据：姓名、年龄
A 20
B 18
C 20
D 24

--sql语句
select concat_ws('|', collect_set(name)) as name_list, age
from person
group by age;

--结果
name_list	age
B	18
A|C	20
D	24

将可枚举的值作为新的字段

--建表语句
create table game(year int, class string, score int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table game;

--数据：年份、班级、得分
2020 ClassA 10
2020 ClassB 12
2020 ClassC 9
2021 ClassA 12
2021 ClassB 8

--sql语句
select year,
max(case when class = 'ClassA' then score end) as A,
max(case when class = 'ClassB' then score end) as B,
max(case when class = 'ClassC' then score end) as C
from game
group by year;

--结果
year	a	b	c
2020	10	12	9
2021	12	8	NULL

连续性问题

连续3天登录的用户

--建表语句
create table login(uid int,	dt date, status int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table login;

--数据：用户id、日期、是否登录
1	2022-03-01	0
1	2022-03-02	1
1	2022-03-03	1
1	2022-03-04	1
1	2022-03-05	1
1	2022-03-06	0
1	2022-03-07	1
2	2022-03-01	0
2	2022-03-02	1
2	2022-03-03	1
2	2022-03-04	0
2	2022-03-05	0
2	2022-03-06	1
2	2022-03-07	1
3	2022-03-01	1
3	2022-03-02	1
3	2022-03-03	1
3	2022-03-04	0
3	2022-03-05	1
3	2022-03-06	1
3	2022-03-07	1

--sql语句
select *
from (
    --查询用户在起始日期及其之后的连续登录次数
    select t1.uid, min(t1.dt) as start_day, count(*) as day_count
    from (
        --按照用户分组，按照日期递增排序，计算连续登录操作的锚定日期
        select uid, dt, date_sub(dt, row_number() over(partition by uid order by dt asc)) as diff
        from login
        where status = 1 --筛选登录的日期
    ) t1
    group by t1.uid, t1.diff --按照用户、锚定日期分组
) t2
where t2.day_count > 2; --筛选出至少3次的连续登录操作

--结果
t2.uid	t2.start_day	t2.day_count
1	2022-03-02	4
3	2022-03-01	3
3	2022-03-05	3

连续3年获得冠军的队伍

--建表语句
create table champion(year int, team string)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table champion;

--数据：年份、获得冠军的队伍
2000 Sun
2001 Lakers
2002 Rockets
2003 Rockets
2004 Rockets
2005 Spurs
2006 Spurs
2007 Sun
2008 Lakers
2009 Lakers
2010 Lakers
2011 Lakers
2012 Warriors
2013 Warriors
2014 Warriors
2015 Heat
2016 Warriors
2017 Cavaliers
2018 Warriors
2019 Sun
2020 Warriors
2021 Warriors
2022 Raptors

--sql语句
--查询队伍在起始年份及其之后的连续获得冠军的次数
select t1.team, min(t1.year) as start_year, count(*) as count
from (
	--按照队伍分组，按照年份递增排序，计算队伍获得冠军的锚定年份
	select year, team, year - row_number() over(partition by team order by year asc) as diff
	from champion
) t1
group by t1.team, t1.diff --按照队伍、锚定年份分组
having count > 2; --筛选出至少3次的连续获得冠军

--结果
t2.team	t2.start_year	t2.count
Lakers	2008	4
Rockets	2002	3
Warriors	2012	3

间隔连续问题

给定每天登录游戏的所有用户，返回每个用户连续登录的最长天数，间隔一天的两次登录也可以看作连续登录

--建表语句
create table login(id int, dt date)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table login;

--数据：用户id、登录日期
1001 2022-07-01
1002 2022-07-01
1003 2022-07-01
1001 2022-07-02
1002 2022-07-03
1001 2022-07-04
1002 2022-07-05
1003 2022-07-05
1001 2022-07-07
1002 2022-07-07
1003 2022-07-08
1001 2022-07-09
1002 2022-07-09

--sql语句
--按照用户id分组，查询每个用户连续登录的最长天数
select id, max(day_count) as day_count
from (
    --按照用户id、flag字段分组，计算登录日期的最大值、最小值之差，表示一次连续登录的天数
	select id, datediff(max(dt), min(dt)) + 1 as day_count
	from (
        --按照用户id分组，按照登录日期递增排序，判断当前登录、上次登录的日期之差是否大于2，累计求和，
        --使得属于连续登录的所有日期具有相同取值的flag字段
		select id, dt,
		sum(if(diff > 2, 1, 0)) over(partition by id order by dt asc) as flag
		from (
            --查询当前登录、上次登录的日期之差diff
			select id, dt, 
			case when pre_dt is null then 0
			else datediff(dt, pre_dt) end as diff
			from (
                --按照用户id分组，按照登录日期递增排序，查询当前登录日期、上次登录日期
				select id, dt,
				lag(dt) over(partition by id order by dt asc) as pre_dt
				from login
			) t1
		) t2
	) t3
	group by id, flag
) t4
group by id;

--结果
id	day_count
1001	4
1002	9
1003	1

波峰波谷问题

股票价格在时间点上的波峰与波谷

--建表语句
create table price(stock string, time string, price int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table price;

--数据：股票id、时间点、价格
A1 06:00 12
A1 09:00 16
A1 12:00 24
A1 15:00 17
A1 18:00 11
A1 21:00 13
B1 06:00 18
B1 09:00 12
B1 12:00 13
B1 15:00 13
B1 18:00 15
B1 21:00 17
C1 06:00 12
C1 09:00 13
C1 12:00 15
C1 15:00 17
C1 18:00 18
C1 21:00 20

--sql语句
select * from (
    --查询波峰点
    select t1.stock, t1.time, t1.price, 'top' as top
    from (
        select stock, time, price,
        --按照股票分组，按照时间点递增排序
        lag(price) over(partition by stock order by time asc) as previous, --前面时间点的价格
        lead(price) over(partition by stock order by time asc) as next --后面时间点的价格
        from price
    ) t1
    where t1.price > t1.previous and t1.price > t1.next --筛选出波峰点

    union

    --查询波谷点
    select t2.stock, t2.time, t2.price, 'down' as top
    from (
        select stock, time, price,
        lag(price) over(partition by stock order by time asc) as previous,
        lead(price) over(partition by stock order by time asc) as next
        from price
    ) t2
    where t2.price < t2.previous and t2.price < t2.next --筛选出波谷点
) t3;

--结果
t3.stock	t3.time	t3.price	t3.top
A1	12:00	24	top
A1	18:00	11	down
B1	09:00	12	down

浏览时长问题

给定用户在多个时间点上的点击浏览记录，如果两次点击浏览的时间间隔不超过30个单位，则两次浏览属于相同的会话。查询用户在每次会话中的浏览时长、浏览步长，步长表示点击浏览的次数

--建表语句
create table click(id string, time int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table click;

--数据：用户id、点击浏览时间
a 1001
a 1005
a 1020
a 1048
a 1078
a 1230
a 1245
a 1270
a 1282
b 1101
b 1132
b 1156
b 1180
b 1200
b 1230
b 1345
b 1370
b 1400

--sql语句
--查询用户在每个会话中的起始时间点、点击浏览次数、浏览时长
select id, min(time) as start_time, count(*) as count, max(time) - min(time) as total_time
from (
    --分组排序后，从上到下计算value列的累加和。如果求和结果相同，则表示属于相同的会话
    select t2.id, t2.time, sum(t2.value) over(partition by id order by time asc) as stage
    from (
        --如果与前一次点击的时间之差超过30，则value列为1，否则为0
        --value列为1表示这次点击属于一个新的会话，为0表示这次点击与前一次属于相同的会话
        select id, time, diff, case when nvl(diff, 9999) > 30 then 1 else 0 end value
        from (
            --按照用户id分组，按照点击浏览时间递增排序，计算前后两次点击的时间之差
            select id, time, time - lag(time) over(partition by id order by time asc) as diff
            from click
        ) t1
    ) t2
) t3
group by id, stage;

--结果
id	start_time	count	total_time
a	1001	5	77
a	1230	4	52
b	1101	1	0
b	1132	5	98
b	1345	3	55

活动时长问题

每个品牌具有多个打折活动，给定每个活动的开始时间、结束时间，返回每个品牌实际参与打折的天数，重复日期不计算在内

--建表语句
create table discount(brand int, start_dt date, end_dt date)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table discount;

--数据：品牌id、活动开始时间、活动结束时间
1001 2022-07-01 2022-07-03
1001 2022-07-05 2022-07-10
1002 2022-07-02 2022-07-08
1002 2022-07-06 2022-07-09
1003 2022-07-12 2022-07-20
1003 2022-07-15 2022-07-18
1004 2022-07-20 2022-07-25
1004 2022-07-22 2022-07-26
1004 2022-07-28 2022-07-30

--sql语句
--按照品牌id分组，计算打折活动的不重复天数之和day_count
select brand, sum(count) as day_count
from (
    --根据当前结束时间的最大值max_dt，计算每行打折活动可以贡献的活动天数count
	select brand, start_dt, end_dt, max_dt,
	case when max_dt is null then datediff(end_dt, start_dt) + 1
	when max_dt < start_dt then datediff(end_dt, start_dt) + 1
	when max_dt < end_dt then datediff(end_dt, max_dt)
	else 0 end as count
	from (
        --按照品牌id分组，按照开始时间、结束时间递增排序，查询当前结束时间的最大值max_dt
		select brand, start_dt, end_dt,
		max(end_dt) over(partition by brand order by start_dt asc, end_dt asc rows 
                         between unbounded preceding and 1 preceding) as max_dt
		from discount
	) t1
) t2
group by brand;

--结果
brand	day_count
1001	9
1002	8
1003	9
1004	11

同时在线问题

给定每个用户在线的开始时间、结束时间，返回一个时间段与人数，这个时间段具有最多的在线人数

--建表语句
create table live(id int, start_dt date, end_dt date)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table live;

--数据：用户id、开始时间、结束时间
1001 2022-07-01 2022-07-02
1001 2022-07-04 2022-07-05
1001 2022-07-07 2022-07-10
1001 2022-07-13 2022-07-18
1002 2022-07-01 2022-07-02
1002 2022-07-04 2022-07-05
1002 2022-07-07 2022-07-08
1002 2022-07-10 2022-07-11
1002 2022-07-13 2022-07-14
1002 2022-07-16 2022-07-17
1002 2022-07-19 2022-07-20
1003 2022-07-01 2022-07-20
1004 2022-07-04 2022-07-08
1004 2022-07-12 2022-07-16
1005 2022-07-03 2022-07-06
1005 2022-07-09 2022-07-11
1006 2022-07-04 2022-07-06
1007 2022-07-09 2022-07-12
1008 2022-07-06 2022-07-08
1008 2022-07-11 2022-07-13
1009 2022-07-06 2022-07-08
1009 2022-07-18 2022-07-19
1010 2022-07-11 2022-07-14

--sql语句
select dt, online
from (
	select dt, 
	sum(count) over(order by dt asc) as online
	from (
		select dt, sum(value) as count
		from (
			select id, start_dt as dt, 1 as value from live
			union
			select id, date_add(end_dt, 1) as dt, -1 as value from live
		) t1
		group by dt
	) t2
) t3
order by online desc, dt asc;

--结果
dt	online
2022-07-04	6
2022-07-06	6
2022-07-07	6
2022-07-11	6
2022-07-13	6
2022-07-10	5
2022-07-12	5
2022-07-14	5
2022-07-09	4
2022-07-16	4
2022-07-01	3
2022-07-15	3
2022-07-17	3
2022-07-18	3
2022-07-19	3
2022-07-03	2
2022-07-20	2
2022-07-21	0

区间合并问题

给定多个时间段，每个时间段分为开始时间、结束时间，将相互重叠的多个时间段合并为一个区间

--建表语句
create table time_merge(id int, start_time int, end_time int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table time_merge;

--数据：id、开始时间、结束时间
1 12 15
2 57 58
3 29 32
4 30 31
5 17 19
6 44 44
7 56 57
8 16 18

--sql语句
--按照区间序号进行分组，查询每个分组的最小开始时间作为区间开始时间，最大结束时间作为区间结束时间
select flag, min(start_time) as start_time, max(end_time) as end_time
from (
    --判断哪些时间段属于相同区间，flag表示时间段归属的区间序号，值相同表示属于相同区间
	select id, start_time, end_time,
	sum(count) over(order by start_time asc, end_time asc) as flag
	from (
        --根据当前结束时间的最大值max_dt进行比较，标记每个时间段是否为新的区间
		select id, start_time, end_time,
		case when max_dt is null then 1 --作为一个新的区间
		when max_dt < start_time then 1 --作为一个新的区间
		else 0 end as count --与前面的区间具有重叠
		from (
            --按照开始时间、结束时间递增排序，查询当前结束时间的最大值max_dt
			select id, start_time, end_time,
			max(end_time) over(order by start_time asc, end_time asc rows 
                               between unbounded preceding and 1 preceding) as max_dt
			from time_merge
		) t1
	) t2
) t3
group by flag
;

--结果
flag	start_time	end_time
1	12	15
2	16	19
3	29	32
4	44	44
5	56	58

共同好友问题

给定每个用户的好友列表，好友关系是互相对称的，返回任意两个用户的共同好友列表

--建表语句
create table common_friend(id string, friends string)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table common_friend;

--数据：用户id、好友id列表
A B,C,D
B A,C,E
C A,B,D,E,F
D A,C,F
E B,C
F C,D

--sql语句
--创建临时表，将好友关系分解为最细粒度
create table friend as
select id, friend from common_friend lateral view explode(split(friends, ',')) temp as friend;

--按照用户的两两组合进行分组，将所有的共同好友放入列表
select t1.ids, concat_ws(',', collect_list(t1.friend)) as common_friend
from (
    --将好友关系表与自身进行连接，查询每个用户是哪两个用户的共同好友
	select a.friend, concat(a.id, ',', b.id) as ids
	from friend a
	join friend b
	on a.friend = b.friend --按照共同好友进行连接
	where a.id < b.id --筛选出重复记录
) t1
group by t1.ids
;

--结果
t1.ids	common_friend
A,B	C
A,C	B,D
A,D	C
A,E	B,C
A,F	C,D
B,C	A,E
B,D	A,C
B,E	C
B,F	C
C,D	A,F
C,E	B
C,F	D
D,E	C
D,F	C
E,F	C

可能好友问题

给定每个用户的好友列表，好友关系是互相对称的，返回每个用户的可能好友。如果两个用户不是好友关系，并且两者拥有至少一个（或者两个）共同好友，则两者互相是可能好友

--建表语句
create table maybe_friend(id string, friends string)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table maybe_friend;

--数据：用户id、好友id列表
A B,C,D
B A,C,E
C A,B,D,E,F
D A,C,F
E B,C
F C,D

--sql语句
--创建临时表，将好友关系分解为最细粒度
with friend as (
select id, friend from common_friend lateral view explode(split(friends, ',')) temp as friend)

--将具有至少两个共同好友的临时表与好友关系表进行连接，如果临时表的两个用户是好友关系，则在好友关系表中存在对应记录，否则不存在对应记录，
--表示两者是可能好友
select t2.id1, t2.id2
from (
    --查询具有至少两个共同好友的任意两个用户
	select t1.id1, t1.id2
	from (
        --将好友关系表与自身进行连接，查询任意两个用户具有的共同好友
		select a.id as id1, b.id as id2, a.friend
		from friend a
		join friend b
		on a.friend = b.friend
		where a.id < b.id
	) t1
	group by t1.id1, t1.id2
	having count(t1.friend) >= 2
) t2
left join friend
on t2.id1 = friend.id
and t2.id2 = friend.friend
where friend.id is null --排除真实好友，筛选可能好友
;

--结果
t2.id1	t2.id2
A	E
A	F
B	D

登录行为分析

有关的统计指标包含：访问量、活跃用户、新增用户、留存用户、流失用户、沉默用户、回流用户

含义解释：（1）活跃用户，每日登录应用的用户，（2）新增用户，在当前日期第一次登录应用的用户，（3）留存用户，在当前日期登录应用的用户，并且在之前日期登录过应用，（4）流失用户，指定时间内没有登录应用的用户，（5）沉默用户，只有第一次登录应用的用户，之后没有登录过应用，（6）回流用户，在当前日期登录应用的用户，并且在之前的指定时间内没有登录过应用

角色分配：活跃、新增 a、留存 b、留存 c、流失 d、沉默 e、回流 f

--建表语句
create table login_action(uid string, login_date int)
row format delimited fields terminated by ' ';
load data local inpath '/temp/sql.txt' into table login_action;

--数据：用户id、登录日期
d 20220321
e 20220321
f 20220321

a 20220322
b 20220322
d 20220322

a 20220323
b 20220323
c 20220323

a 20220324
b 20220324
c 20220324

a 20220325
b 20220325
c 20220325
f 20220325

--活跃用户：

--新增用户：需要一张全量的用户表，一张某日的登录行为表
--将日期20220321的登录用户作为初始表，表示已知的用户，然后查询日期20220322的新增用户
--登录行为表与全量用户表进行关联，行为表中存在、用户表中不存在的记录表示这个日期的新增用户
--查询出每个日期的新增用户后，需要追加到全量的用户表中

--全量的用户表：表示用户成为新增用户的日期
create table user_add(uid string, add_date int);
insert into table user_add (select * from login_action where login_date = 20220321);

select t1.uid, t1.login_date
from login_action t1
left join user_add t2
on t1.uid = t2.uid
where t1.login_date = 20220322 --查询日期20220322的新增用户
and t2.uid is null --查询在用户表中不存在的记录，即这个日期的新增用户
;

--20220322的新增用户
t1.uid	t1.add_date
a	20220322
b	20220322

--所有日期的新增用户
user_add.uid	user_add.add_date
d	20220321
e	20220321
f	20220321
a	20220322
b	20220322
c	20220323

--留存用户：需要一张全量的用户表，一张某日的登录行为表
--登录行为表与全量用户表进行关联，行为表中的登录日期与用户表中的登录日期之差为1天，表示1日的留存用户
--使用union all合并1日、2日的留存用户

select t1.uid, t1.login_date, t2.add_date
from login_action t1
join user_add t2
on t1.uid = t2.uid
where t1.login_date = 20220324 --查询日期20220324的留存用户
and t2.add_date = (20220324 - 1) --查询1日留存用户

union all --合并1日、2日留存用户

select t1.uid, t1.login_date, t2.add_date
from login_action t1
join user_add t2
on t1.uid = t2.uid
where t1.login_date = 20220324 --查询日期20220324的留存用户
and t2.add_date = (20220324 - 2) --查询2日留存用户
;

--20220324的留存用户：1日、2日
_u1.uid	_u1.login_date	_u1.add_date
c	20220324	20220323
b	20220324	20220322
a	20220324	20220322

--流失用户：需要一张全量的登录行为表
--查询每个用户的登录日期的最大值，与当前日期之差超过2日，表示流失用户

select uid, max(login_date) as last_login
from login_action
where login_date <= 20220325 --查询日期20220325的流失用户
group by uid
having max(login_date) < (20220325 - 2) --超过2日表示流失用户
;

--20220325的流失用户
uid	last_login
d	20220322
e	20220321

--沉默用户：需要一张全量的登录行为表
--查询每个用户的登录日期的数量，只有一次登录操作，表示沉默用户

select uid, max(login_date) as once_login
from login_action
group by uid
having count(login_date) = 1 --只有一次登录操作的用户表示沉默用户
;

--20220325的沉默用户
uid	once_login
e	20220321

--回流用户：需要一张全量的登录行为表
--日期20220325的活跃用户，如果在之前的日期为流失用户，则在日期20220325为回流用户

select t1.uid, t1.login_date, t2.last_login
from (
    select uid, login_date
    from login_action
    where login_date = 20220325 --查询20220325的活跃用户
) t1
join
(
    select uid, max(login_date) as last_login
    from login_action
    where login_date < 20220325 --查询20220325之前的流失用户
    group by uid
    having max(login_date) < (20220325 - 2)
) t2
on t1.uid = t2.uid
;

--20220325的h用户
t1.uid	t1.login_date	t2.last_login
f	20220325	20220321

购买行为分析

有关的统计指标包含：点击量、下单数、支付数

点击行为分析

JSON解析

解析嵌套Json字符串

get_json_object()方法

标签：面试题,07,--,用户,t1,2022,sql,id
来源： https://www.cnblogs.com/atao-BigData/p/16603430.html