Hive--使用Python脚本(TRANSFORM)实现数据处理
作者:互联网
-- Raw ratings table: user id, movie id, the user's rating, and the time the
-- user watched the movie (a Unix epoch timestamp kept as text).
CREATE TABLE u_data (
    userid   INT,
    movieid  INT,
    rating   INT,
    unixtime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
-- Load the ratings file from the local filesystem into the raw table.
-- Without OVERWRITE, the rows are added to whatever the table already holds.
LOAD DATA LOCAL INPATH '/export/datas/u.data'
INTO TABLE u_data;
-- Sanity check: count the rows currently in the raw table.
SELECT COUNT(*)
FROM u_data;
-- Derived table: user id, movie id, rating, and the day of the week on which
-- the user watched the movie.
CREATE TABLE u_data_new (
    userid  INT,
    movieid INT,
    rating  INT,
    weekday INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
--创建Python脚本,将原始表中的Unix时间戳转换为对应的星期几(以下命令在Linux终端中执行,而非Hive命令行)
vim /export/datas/weekday_mapper.py
"""Hive TRANSFORM mapper: replace the Unix timestamp column with its weekday.

Reads tab-separated rows (userid, movieid, rating, unixtime) from stdin and
writes (userid, movieid, rating, weekday) to stdout, one row per line, where
weekday is the ISO weekday number: 1 for Monday through 7 for Sunday.
"""
import sys
import datetime


def weekday_row(line):
    """Convert one input row into its output row (without a trailing newline).

    Args:
        line: one tab-separated record ``userid\\tmovieid\\trating\\tunixtime``;
            surrounding whitespace, including the line terminator, is ignored.

    Returns:
        The first three fields unchanged followed by the ISO weekday of the
        timestamp, joined with tabs.

    Raises:
        ValueError: if the row does not have exactly four fields or the
            timestamp is not numeric.
    """
    userid, movieid, rating, unixtime = line.strip().split('\t')
    # fromtimestamp() interprets the epoch value in the local time zone of the
    # machine running the script, so the weekday follows that zone.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


def main():
    """Stream every stdin row through weekday_row() and emit it on stdout."""
    for line in sys.stdin:
        # sys.stdout.write behaves the same on Python 2 and Python 3, unlike
        # the Python 2 only ``print`` statement.
        sys.stdout.write(weekday_row(line) + '\n')


if __name__ == '__main__':
    main()
-- Register the Python script with the session as a resource, then rebuild
-- u_data_new by streaming every u_data row through the script. INSERT OVERWRITE
-- replaces whatever the target table previously contained.
ADD FILE /export/datas/weekday_mapper.py;

INSERT OVERWRITE TABLE u_data_new
SELECT
    TRANSFORM (userid, movieid, rating, unixtime)
    USING 'python weekday_mapper.py'
    AS (userid, movieid, rating, weekday)
FROM u_data;
-- Number of viewings recorded for each day of the week.
-- The count is aliased so the result column has a stable, readable name, and
-- the rows are ordered by weekday because GROUP BY alone guarantees no order.
SELECT
    weekday,
    COUNT(*) AS view_count
FROM u_data_new
GROUP BY weekday
ORDER BY weekday;
标签:rating,movieid,--,Hive,Python,weekday,data,INT 来源: https://blog.csdn.net/qq_46893497/article/details/109959851