..
Hive
Sample Record
{
// string, 22 character unique user id, maps to the user in user.json
"user_id": "Ha3iJu77CxlrFm-vQRs_8g",
// string, the user's first name
"name": "Sebastien",
// integer, the number of reviews they've written
"review_count": 56,
// string, when the user joined Yelp, formatted like YYYY-MM-DD
"yelping_since": "2011-01-01",
// array of strings, an array of the user's friend as user_ids
"friends": [
"wqoXYLWmpkEH0YvTmHBsJQ",
"KUXLLiJGrjtSsapmxmpvTA",
"6e9rJKQC3n0RSKyHLViL-Q"
],
// integer, number of useful votes sent by the user
"useful": 21,
// integer, number of funny votes sent by the user
"funny": 88,
// integer, number of cool votes sent by the user
"cool": 15,
// integer, number of fans the user has
"fans": 1032,
// array of integers, the years the user was elite
"elite": [
2012,
2013
],
// float, average rating of all reviews
"average_stars": 4.31,
// integer, number of hot compliments received by the user
"compliment_hot": 339,
// integer, number of more compliments received by the user
"compliment_more": 668,
// integer, number of profile compliments received by the user
"compliment_profile": 42,
// integer, number of cute compliments received by the user
"compliment_cute": 62,
// integer, number of list compliments received by the user
"compliment_list": 37,
// integer, number of note compliments received by the user
"compliment_note": 356,
// integer, number of plain compliments received by the user
"compliment_plain": 68,
// integer, number of cool compliments received by the user
"compliment_cool": 91,
// integer, number of funny compliments received by the user
"compliment_funny": 99,
// integer, number of writer compliments received by the user
"compliment_writer": 95,
// integer, number of photo compliments received by the user
"compliment_photos": 50
}
Hive commands
USE yelp;
CREATE TABLE user_json_parsing (json STRING);
LOAD DATA INPATH "/user/hduser/user.json" INTO TABLE user_json_parsing;
SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos") from (select * from user_json_parsing) as q;
SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
from (select * from user_json_parsing) as q;
SELECT * FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
from (select * from user_json_parsing) as q) as p;
SELECT user_id, name, review_count, yelping_since, ARRAY(useful, funny, cool) as votes, (SPLIT(elite, ',')) as elite, average_stars, named_struct("compliment_hot",compliment_hot, "compliment_more",compliment_more, "compliment_profile",compliment_profile, "compliment_cute",compliment_cute, "compliment_list",compliment_list, "compliment_note",compliment_note, "compliment_plain",compliment_plain, "compliment_cool",compliment_cool, "compliment_funny",compliment_funny, "compliment_writer",compliment_writer, "compliment_photos",compliment_photos) FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos") as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos) from (select * from user_json_parsing) as q) as p limit 10;
SELECT user_id, name, review_count, CAST(date_format(yelping_since, "YYYY") AS INT) as year_joined,
named_struct("useful", useful,"funny", funny,"cool", cool) as votes,
(SPLIT(elite, ',')) as elite, average_stars,
named_struct("compliment_hot",compliment_hot,
"compliment_more",compliment_more, "compliment_profile",compliment_profile,
"compliment_cute",compliment_cute, "compliment_list",compliment_list,
"compliment_note",compliment_note, "compliment_plain",compliment_plain,
"compliment_cool",compliment_cool, "compliment_funny",compliment_funny,
"compliment_writer",compliment_writer, "compliment_photos",compliment_photos)
as compliment FROM (SELECT json_tuple(json,
"user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
as
(user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
from (select * from user_json_parsing) as q) as p limit 5;
Creating table
CREATE TABLE users ( user_id string, name string, review_count string, votes struct<useful:string,funny:string,cool:string>, elite array<string>, average_stars string, compliment struct<compliment_hot:string,compliment_more:string,compliment_profile:string,compliment_cute:string,compliment_list:string,compliment_note:string,compliment_plain:string,compliment_cool:string,compliment_funny:string,compliment_writer:string,compliment_photos:string>) PARTITIONED BY (year_joined INT)
Inserting
INSERT OVERWRITE TABLE users PARTITION(year_joined) SELECT user_id, name, review_count, named_struct("useful", useful,"funny", funny,"cool", cool) as votes, (SPLIT(elite, ',')) as elite, average_stars, named_struct("compliment_hot",compliment_hot, "compliment_more",compliment_more, "compliment_profile",compliment_profile, "compliment_cute",compliment_cute, "compliment_list",compliment_list, "compliment_note",compliment_note, "compliment_plain",compliment_plain, "compliment_cool",compliment_cool, "compliment_funny",compliment_funny, "compliment_writer",compliment_writer, "compliment_photos",compliment_photos) as compliment, CAST(date_format(yelping_since, "YYYY") AS INT) as year_joined FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos") as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos) from (select * from user_json_parsing) as q) as p;
Creating partitions
Physical Location
User with the most number of elite years
SELECT name, SIZE(elite) as nr_elite_years FROM users ORDER BY nr_elite_years DESC LIMIT 5;