..
Pig
Loading Data
-- Load the Yelp tip records from HDFS; every record is exposed with five typed
-- fields (four chararray columns followed by an int compliment counter).
-- NOTE(review): the fourth field is declared as `data`; the Yelp tip records
-- presumably call it `date` -- confirm against the dataset. The name is kept
-- unchanged here because later statements refer to the field as `data`.
tips_json_parsing =
    LOAD '/user/hduser/yelp/tip.json'
    USING JsonLoader('user_id:chararray, business_id:chararray, text: chararray, data: chararray, compliment_count:int');
Number of comments by each user
-- Collect all tips written by the same user into one bag per user_id.
group_user =
    GROUP tips_json_parsing BY user_id;

-- Emit one row per user: the grouping key (user_id) and the size of that
-- user's bag of tips.
tmp =
    FOREACH group_user
    GENERATE group, COUNT(tips_json_parsing);

-- Print every (user_id, tip count) pair to the console.
DUMP tmp;
User with the most comments
-- `tmp` holds (user_id, tip_count) rows; $1 is the count column.
-- Sort descending so the most active user comes first (the original sorted
-- ascending and was missing its terminating semicolon).
tmp2 = ORDER tmp BY $1 DESC;
-- Keep only the top row; if several users tie, only one of them is returned.
top_commenter = LIMIT tmp2 1;
-- Pig evaluates lazily: without DUMP/STORE the query is never executed.
DUMP top_commenter;
User with most compliments
-- Collect all tips written by the same user into one bag per user_id.
group_user = GROUP tips_json_parsing BY user_id;
-- One row per user: the user_id and the sum of compliment_count over all of
-- that user's tips. (The original emitted a single tuple holding the raw bag
-- of counts plus the sum and dropped the group key, so the output could not
-- identify the user and the sort key was the tuple, not the total.)
tmp = FOREACH group_user GENERATE
          group AS user_id,
          SUM(tips_json_parsing.compliment_count) AS total_compliments;
-- Highest total first.
tmp2 = ORDER tmp BY total_compliments DESC;
-- Keep only the top row; if several users tie, only one of them is returned.
top_complimented_user = LIMIT tmp2 1;
DUMP top_complimented_user;
Restaurant with the most compliments
-- Collect all tips left for the same business into one bag per business_id.
-- (The original also built an identical, unused `group_user` relation; it has
-- been dropped.)
tmp = GROUP tips_json_parsing BY business_id;
-- One row per business: the business_id and the SUM of compliment_count over
-- its tips. The original used COUNT, which yields the number of tips rather
-- than the number of compliments.
tmp2 = FOREACH tmp GENERATE
           group AS business_id,
           SUM(tips_json_parsing.compliment_count) AS total_compliments;
-- Highest total first.
tmp2 = ORDER tmp2 BY total_compliments DESC;
-- Keep only the top row; if several businesses tie, only one is returned.
top_business = LIMIT tmp2 1;
DUMP top_business;
Each restaurant’s most reviewed day
-- Split the timestamp field on the first space into two pieces, named `date`
-- and `time` by the AS clause.
-- NOTE(review): assumes `data` holds a 'date time' string separated by a single
-- space -- confirm against the dataset.
date_split_data = FOREACH tips_json_parsing GENERATE user_id, business_id, compliment_count, text, FLATTEN(STRSPLIT(data, ' ', 2)) AS (date:chararray, time:chararray);

-- Count the tips for every (business, day) combination.
group_data = GROUP date_split_data BY (business_id, date);
tmp = FOREACH group_data GENERATE
          FLATTEN(group) AS (business_id:chararray, date:chararray),
          COUNT(date_split_data.time) AS tip_count;

-- For each business keep only its busiest day. The original sorted all
-- (business, day) counts globally, which never selects one day per business.
by_business = GROUP tmp BY business_id;
tmp2 = FOREACH by_business {
    ranked  = ORDER tmp BY tip_count DESC;
    -- If several days tie for a business, only one of them is kept.
    busiest = LIMIT ranked 1;
    GENERATE FLATTEN(busiest);
};

-- Pig evaluates lazily: without DUMP/STORE the query is never executed.
DUMP tmp2;