..

Pig

Loading Data

-- Load the Yelp tips file from HDFS with JsonLoader, one tuple per tip, using the
-- schema (user_id, business_id, text, data, compliment_count).
-- NOTE(review): the `data` field is later split on a space into a date and a time,
-- so the name looks like a typo for `date`; later statements refer to it as `data`,
-- so rename it everywhere at once if you change it.
tips_json_parsing = LOAD '/user/hduser/yelp/tip.json' USING JsonLoader('user_id:chararray, business_id:chararray, text: chararray, data: chararray, compliment_count:int');

Number of comments by each user

-- Bucket every tip under the user who wrote it, then emit one row per user:
-- (user_id, number of tips in that user's bucket).
tips_by_user = GROUP tips_json_parsing BY user_id;
tmp = FOREACH tips_by_user
      GENERATE group, COUNT(tips_json_parsing);
DUMP tmp;

Pasted image 20231031144013

User with the most comments

-- Sort the per-user tip counts (column $1 of `tmp`, built in the previous step)
-- from highest to lowest so the user with the most comments is the first row.
-- DESC is required: the default ascending order would put that user last.
tmp2 = ORDER tmp BY $1 DESC;
dump tmp2;

Pasted image 20231031144252

User with most compliments

-- Total compliments received per user, highest first.
group_user = GROUP tips_json_parsing BY user_id;
-- Emit the user id (the group key) next to the sum of compliment_count over that
-- user's tips. Without `group` the result cannot be attributed to a user, and
-- wrapping the fields in parentheses would produce a single tuple column that
-- ORDER BY cannot sort on.
tmp = FOREACH group_user GENERATE group AS user_id, SUM(tips_json_parsing.compliment_count) AS total_compliments;
-- Sort on the scalar total, descending, so the top user is the first row.
tmp2 = ORDER tmp BY total_compliments DESC;
dump tmp2;

Pasted image 20231031145139

Restaurant with the most compliments

-- Total compliments received per business, highest first.
-- (The alias is still called group_user, but here it groups tips by business_id.)
group_user = GROUP tips_json_parsing BY business_id;
-- SUM adds up the compliments; COUNT would only return the number of tips per business.
compliments_per_business = FOREACH group_user GENERATE group AS business_id, SUM(tips_json_parsing.compliment_count) AS total_compliments;
-- Descending sort puts the business with the most compliments on the first row.
tmp2 = ORDER compliments_per_business BY total_compliments DESC;
dump tmp2;

Pasted image 20231031150125

Each restaurant’s most reviewed day

-- For every business, find the calendar day on which it received the most tips.

-- Split the timestamp string on the first space into a date part and a time part.
date_split_data = FOREACH tips_json_parsing GENERATE user_id, business_id, compliment_count, text, FLATTEN(STRSPLIT(data, ' ', 2)) AS (date:chararray, time:chararray);
-- Count tips per (business, day).
group_data = GROUP date_split_data BY (business_id, date);
tmp = FOREACH group_data GENERATE FLATTEN(group) AS (business_id, date), COUNT(date_split_data.time) AS tip_count;
-- A global sort of the counts does not answer the question per business, so
-- regroup by business and keep only the day with the highest count in each group.
-- If several days tie for the maximum, LIMIT keeps just one of them.
by_business = GROUP tmp BY business_id;
tmp2 = FOREACH by_business {
    ranked_days = ORDER tmp BY tip_count DESC;
    busiest_day = LIMIT ranked_days 1;
    GENERATE FLATTEN(busiest_day);
};
dump tmp2;

Pasted image 20231031160221