首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 其他教程 > 互联网 >

hive 多字段与此同时count(distinct)优化

2012-07-16 
hive 多字段同时count(distinct)优化1.??? 需求与现状:源表:pcup_3month_login_dtl_mes , 记录数12亿,文件

hive 多字段同时count(distinct)优化

1.??? 需求与现状:
源表:pcup_3month_login_dtl_mes , 记录数12亿,文件数 300
统计SQL:

insert overwrite table pcup_logininfo_tmp partition(data_type = 1)  select popt_id,         null as sndaid,         count(distinct case when login_date>='2012-02-01' and login_date<'2012-05-01' then login_date else null end) as m3_login,         null as m3_login_top5,         count(distinct case when login_date>='2012-05-01' and login_date<='2012-05-09' then login_date else null end) as mn_login,         null as mn_login_top5,         null as m3_apptype,         null as mn_apptype,         count(distinct case when login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='1' then login_date else null end) as m3_g_login,         null as m3_g_login_top5,         count(distinct case when login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='2' then login_date else null end) as m3_l_login,         null as m3_l_login_top5,         count(distinct case when login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='3' then login_date else null end) as m3_s_login,         null as m3_s_login_top5,         count(distinct case when login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='4' then login_date else null end) as m3_o_login,         null as m3_o_login_top5,         count(distinct case when login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='1' then login_date else null end) as mn_g_login,         null as mn_g_login_top5,         count(distinct case when login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='2' then login_date else null end) as mn_l_login,         null as mn_l_login_top5,         count(distinct case when login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='3' then login_date else null end) as mn_s_login,         null as mn_s_login_top5,         count(distinct case when login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='4' then login_date else null end) as mn_o_login,         null as mn_o_login_top5  from pcup_3month_login_dtl_mes  group by popt_id;

?特点:group by 维度少,多字段count(distinct), reduce task非常少(7个)
耗时:1个半小时以上

?

2.??? 优化思路:

利用union all + group by + rownumber 代替所有的count(distinct);
根据文件大小设置合理的reduce task数量;

?

3.??? 优化后的代码:耗时20分钟左右

SET mapred.reduce.tasks = 100;

?

//初步过滤+去重

create table lxw_test3 as select popt_id,login_date,apptypeid from pcup_3month_login_dtl_mes where login_date>='2012-02-01' and login_date <= '2012-05-09' group by popt_id,login_date,apptypeid;

?

//利用rownumber 函数做去重标记

?

add jar hdfs://nn.dc.sh-wgq.sdo.com:8020/group/p_sdo_data/udf/snda_udf.jar;CREATE TEMPORARY FUNCTION row_number AS 'com.snda.hive.udf.UDFrow_number';                   create table lxw_test4 as select type,popt_id,login_date,row_number(type,login_date,popt_id) as rn from (       select type,popt_id,login_date        from (                select 'm3_login' as type,popt_id,login_date                  from lxw_test3                 where login_date>='2012-02-01' and login_date<'2012-05-01'                 union all                 select 'mn_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-05-01' and login_date<='2012-05-09'                 union all                 select 'm3_g_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='1'                 union all                 select 'm3_l_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='2'                 union all                 select 'm3_s_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='3'                 union all                 select 'm3_o_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-02-01' and login_date<'2012-05-01' and apptypeid='4'                 union all                 select 'mn_g_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='1'                 union all                 select 'mn_l_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='2'                 union all                 select 'mn_s_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='3'                 union all                 select 'mn_o_login' as type,popt_id,login_date                 from lxw_test3                 where login_date>='2012-05-01' and login_date<='2012-05-09' and apptypeid='4'        ) x        distribute by type,login_date,popt_id sort by type,login_date,popt_id ) y;
?

?

//用普通的聚合函数进行汇总

?

insert overwrite table pcup_logininfo_tmp partition(data_type = 99) select popt_id,null as sndaid,sum(case when type = 'm3_login' and rn = 1 then 1 else 0 end) as m3_login,null as m3_login_top5,sum(case when type = 'mn_login' and rn = 1 then 1 else 0 end) as mn_login,null as mn_login_top5,null as m3_apptype,null as mn_apptype,sum(case when type = 'm3_g_login' and rn = 1 then 1 else 0 end) as m3_g_login,null as m3_g_login_top5,sum(case when type = 'm3_l_login' and rn = 1 then 1 else 0 end) as m3_l_login,null as m3_l_login_top5,sum(case when type = 'm3_s_login' and rn = 1 then 1 else 0 end) as m3_s_login,null as m3_s_login_top5,sum(case when type = 'm3_o_login' and rn = 1 then 1 else 0 end) as m3_o_login,null as m3_o_login_top5,sum(case when type = 'mn_g_login' and rn = 1 then 1 else 0 end) as mn_g_login,null as mn_g_login_top5,sum(case when type = 'mn_l_login' and rn = 1 then 1 else 0 end) as mn_l_login,null as mn_l_login_top5,sum(case when type = 'mn_s_login' and rn = 1 then 1 else 0 end) as mn_s_login,null as mn_s_login_top5,sum(case when type = 'mn_o_login' and rn = 1 then 1 else 0 end) as mn_o_login,null as mn_o_login_top5from lxw_test4 group by popt_id
?

?

?

?

?

热点排行