--day 1 data--
pk a pk b pk c
1 x 1 y - -
--stage table1
pk a b c
1 x null null
1 null y null
----- union and group by on pk, get max on each col
--INSERT OVERWRITE TABLE stage_tbl SELECT pk,max(a),max(b),max(c) FROM stage_tbl1 group by pk;
--stage_tbl
pk a b c
1 x y null
--First time load to Final table
--final_tbl
pk a b c timestamp
1 x y null 2017-10-18 11:00:04
--day 2 data--
pk a pk b pk c
- - - - 1 z
2 a 2 b 2 c
3 p 1 o 3 r
--stage table1
pk a b c
1 null o null
1 null null z
2 a null null
2 null b null
2 null null c
3 p null null
3 null null r
----- union and group by on pk, get max on each col
--INSERT OVERWRITE TABLE stage_tbl SELECT pk,max(a),max(b),max(c) FROM stage_tbl1 group by pk;
--stage_tbl
pk a b c
1 null o z
2 a b c
3 p null r
---delta load to final table
Set1 : select * from final_tbl f where pk in (select pk from stage_tbl)
Set2 : select pk, COALESCE(s.a,f.a) as a, COALESCE(s.b,f.b) as b, COALESCE(s.c,f.c) as c from stage_tbl s left join Set1 f on s.pk=f.pk;
Set3 : select * from final_tbl f where pk not in (select pk from stage_tbl)
insert overwrite table final_tbl (select pk,a,b,c,current_time() from Set2 UNION select pk,a,b,c,current_time() from Set3);
Note: Consider utilizing partitions and buckets on attributes/cols like category for improved performance
pk a pk b pk c
1 x 1 y - -
--stage table1
pk a b c
1 x null null
1 null y null
----- union and group by on pk, get max on each col
--INSERT OVERWRITE TABLE stage_tbl SELECT pk,max(a),max(b),max(c) FROM stage_tbl1 group by pk;
--stage_tbl
pk a b c
1 x y null
--First time load to Final table
--final_tbl
pk a b c timestamp
1 x y null 2017-10-18 11:00:04
--day 2 data--
pk a pk b pk c
- - - - 1 z
2 a 2 b 2 c
3 p 1 o 3 r
--stage table1
pk a b c
1 null o null
1 null null z
2 a null null
2 null b null
2 null null c
3 p null null
3 null null r
----- union and group by on pk, get max on each col
--INSERT OVERWRITE TABLE stage_tbl SELECT pk,max(a),max(b),max(c) FROM stage_tbl1 group by pk;
--stage_tbl
pk a b c
1 null o z
2 a b c
3 p null r
---delta load to final table
Set1 : select * from final_tbl f where pk in (select pk from stage_tbl)
Set2 : select pk, COALESCE(s.a,f.a) as a, COALESCE(s.b,f.b) as b, COALESCE(s.c,f.c) as c from stage_tbl s left join Set1 f on s.pk=f.pk;
Set3 : select * from final_tbl f where pk not in (select pk from stage_tbl)
insert overwrite table final_tbl (select pk,a,b,c,current_time() from Set2 UNION select pk,a,b,c,current_time() from Set3);
Note: Consider utilizing partitions and buckets on attributes/cols like category for improved performance
No comments:
Post a Comment