1、数据准备apache
一、users.dat
数据格式:2::M::56::16::70072,
数据条数:共有6040条数据
对应字段:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
字段解释:用户id,性别,年龄,职业,邮政编码oop
二、movies.dat
数据格式:2::Jumanji (1995)::Adventure|Children’s|Fantasy,
数据条数:共有3883条数据
对应字段:MovieID BigInt, Title String, Genres String
字段解释:电影ID,电影名字,电影类型编码
三、ratings.dat
数据格式:1::1193::5::978300760,
数据条数:共有1000209条数据
对应字段:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
字段解释:用户ID,电影ID,评分,评分时间戳spa
2、需求分析3d
一、正确建表,导入数据(三张表,三份数据),并验证是否正确code
(1)建立表orm
create table t_user( userid bigint, --用户id gender string, --性别 age int, --年龄 occupation string, --职业 zipcode string) --邮政编码 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s') stored as textfile; create table t_movie( movieid bigint, --电影ID title string, --电影名字 genres string) --电影类型 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s') stored as textfile; create table t_rating( userid bigint, --用户ID movieid bigint, --电影ID rating double, --评分 timestamped String) --评分时间戳 row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s') stored as textfile;
(2)导入数据blog
load data local inpath '/usr/mydir/data/users.dat' into table t_user; load data local inpath '/usr/mydir/data/movies.dat' into table t_movie; load data local inpath '/usr/mydir/data/ratings.dat' into table t_rating;
(3)验证数据ip
select * from t_user; select count(1) from t_user ; --6040条数据 select * from t_movie; select count(1) from t_movie ; --3883条数据 select * from t_rating; select count(1) from t_rating ; --1000209条数据
user表:hadoop
movie表:
rating表:
二、求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
SELECT title, rating_count FROM (SELECT movieid, COUNT(rating) rating_count from t_rating group by movieid ) t1 left join t_movie t2 on t1.movieid = t2.movieid order by rating_count DESC LIMIT 10;
三、分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分),评分人数大于等于50
SELECT * from (SELECT 'M' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三张表关联,拿到性别,电影名,影评分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'M' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) a UNION ALL SELECT * from (SELECT 'F' as gender, title, AVG(rating) avg_rating, count(rating) count_rating from (SELECT --三张表关联,拿到性别,电影名,影评分 t2.gender, t3.title, t1.rating from t_rating t1 left join t_user t2 on t1.userid = t2.userid left join t_movie t3 on t1.movieid = t3.movieid where t2.gender = 'F' ) t4 group by title HAVING count_rating >= 50 order by avg_rating DESC LIMIT 10 ) b
四、求好片(评分>=4.0)最多的那个年份的最好看的10部电影
SELECT title, avg_rating from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year, avg(t2.rating) avg_rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t4 where year IN (SELECT year from (SELECT year, COUNT(year) count_year from (SELECT t1.title, SUBSTRING(t1.title,-5,4) year from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid where t2.rating >= 4 group by t1.title ) t3 group by year order by count_year DESC LIMIT 1) t5 ) order by avg_rating DESC limit 10;
五、求1997年上映的电影中,评分最高的10部Comedy类电影
SELECT title, genres, AVG(rating) avg_rating from (SELECT t1.title, substr(t1.title,-5,4) year, t1.genres, t2.rating from t_movie t1 left join t_rating t2 on t1.movieid = t2.movieid ) t3 where year = '1997' and genres LIKE '%Comedy%' group by title,genres order by avg_rating DESC LIMIT 10;
六、该影评库中各类类型电影中评价最高的5部电影(类型,电影名,平均影评分)
SELECT type, title FROM (SELECT title, type, ROW_NUMBER() over(partition by type order by avg_rating DESC) rn from (SELECT title, type, AVG(rating) avg_rating from (select t2.genres, t2.title, t1.rating from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type group by type,title ) t4 )t5 where rn <= 5;
七、各年评分最高的电影类型(年份,类型,影评分)
with tmp as (SELECT title, type, rating, movie_year from (select t2.genres, t2.title, t1.rating, SUBSTRING(t2.title,-5,4) as movie_year from t_rating t1 left join t_movie t2 on t1.movieid =t2.movieid ) t3 lateral view explode(split(t3.genres,"\\|")) tmp as type ) select movie_year, type, avg_rating from (select movie_year, type, avg_rating, ROW_NUMBER() over(partition by movie_year order by avg_rating DESC) rn FROM (select movie_year, type, AVG(rating) avg_rating FROM tmp group by movie_year,type ) t1 ) t2 where rn = 1;