..

Hive

Sample Record

{
    // string, 22 character unique user id, maps to the user in user.json
    "user_id": "Ha3iJu77CxlrFm-vQRs_8g",

    // string, the user's first name
    "name": "Sebastien",

    // integer, the number of reviews they've written
    "review_count": 56,

    // string, when the user joined Yelp, formatted like YYYY-MM-DD
    "yelping_since": "2011-01-01",

    // array of strings, an array of the user's friend as user_ids
    "friends": [
        "wqoXYLWmpkEH0YvTmHBsJQ",
        "KUXLLiJGrjtSsapmxmpvTA",
        "6e9rJKQC3n0RSKyHLViL-Q"
    ],

    // integer, number of useful votes sent by the user
    "useful": 21,

    // integer, number of funny votes sent by the user
    "funny": 88,

    // integer, number of cool votes sent by the user
    "cool": 15,

    // integer, number of fans the user has
    "fans": 1032,

    // array of integers, the years the user was elite
    "elite": [
        2012,
        2013
    ],

    // float, average rating of all reviews
    "average_stars": 4.31,

    // integer, number of hot compliments received by the user
    "compliment_hot": 339,

    // integer, number of more compliments received by the user
    "compliment_more": 668,

    // integer, number of profile compliments received by the user
    "compliment_profile": 42,

    // integer, number of cute compliments received by the user
    "compliment_cute": 62,

    // integer, number of list compliments received by the user
    "compliment_list": 37,

    // integer, number of note compliments received by the user
    "compliment_note": 356,

    // integer, number of plain compliments received by the user
    "compliment_plain": 68,

    // integer, number of cool compliments received by the user
    "compliment_cool": 91,

    // integer, number of funny compliments received by the user
    "compliment_funny": 99,

    // integer, number of writer compliments received by the user
    "compliment_writer": 95,

    // integer, number of photo compliments received by the user
    "compliment_photos": 50
}

Hive commands

USE yelp;
CREATE TABLE user_json_parsing (json STRING);
LOAD DATA INPATH "/user/hduser/user.json" INTO TABLE user_json_parsing;
SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")  from (select * from user_json_parsing) as q;
SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
from (select * from user_json_parsing) as q;
SELECT * FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
from (select * from user_json_parsing) as q) as p;
SELECT user_id, name, review_count, yelping_since, ARRAY(useful, funny, cool) as votes, (SPLIT(elite, ',')) as elite, average_stars, named_struct("compliment_hot",compliment_hot, "compliment_more",compliment_more, "compliment_profile",compliment_profile, "compliment_cute",compliment_cute, "compliment_list",compliment_list, "compliment_note",compliment_note, "compliment_plain",compliment_plain, "compliment_cool",compliment_cool, "compliment_funny",compliment_funny, "compliment_writer",compliment_writer, "compliment_photos",compliment_photos) FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos") as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos) from (select * from user_json_parsing) as q) as p limit 10;
 SELECT user_id, name, review_count, CAST(date_format(yelping_since, "YYYY") AS INT) as year_joined,
 named_struct("useful", useful,"funny", funny,"cool", cool) as votes,
 (SPLIT(elite, ',')) as elite, average_stars,
 named_struct("compliment_hot",compliment_hot,
 "compliment_more",compliment_more, "compliment_profile",compliment_profile,
 "compliment_cute",compliment_cute, "compliment_list",compliment_list,
 "compliment_note",compliment_note, "compliment_plain",compliment_plain,
 "compliment_cool",compliment_cool, "compliment_funny",compliment_funny,
 "compliment_writer",compliment_writer, "compliment_photos",compliment_photos)
 as compliment FROM (SELECT json_tuple(json,
 "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos")
 as
 (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos)
 from (select * from user_json_parsing) as q) as p limit 5;

Creating table

CREATE TABLE users ( user_id string, name string, review_count string, votes struct<useful:string,funny:string,cool:string>, elite array<string>, average_stars   string, compliment struct<compliment_hot:string,compliment_more:string,compliment_profile:string,compliment_cute:string,compliment_list:string,compliment_note:string,compliment_plain:string,compliment_cool:string,compliment_funny:string,compliment_writer:string,compliment_photos:string>) PARTITIONED BY (year_joined INT)

Inserting

INSERT OVERWRITE TABLE users PARTITION(year_joined) SELECT user_id, name, review_count,  named_struct("useful", useful,"funny", funny,"cool", cool) as votes, (SPLIT(elite, ',')) as elite, average_stars, named_struct("compliment_hot",compliment_hot, "compliment_more",compliment_more, "compliment_profile",compliment_profile, "compliment_cute",compliment_cute, "compliment_list",compliment_list, "compliment_note",compliment_note, "compliment_plain",compliment_plain, "compliment_cool",compliment_cool, "compliment_funny",compliment_funny, "compliment_writer",compliment_writer, "compliment_photos",compliment_photos) as compliment, CAST(date_format(yelping_since, "YYYY") AS INT) as year_joined FROM (SELECT json_tuple(json, "user_id","name","review_count","yelping_since","friends","useful","funny","cool","fans","elite","average_stars","compliment_hot","compliment_more","compliment_profile","compliment_cute","compliment_list","compliment_note","compliment_plain","compliment_cool","compliment_funny","compliment_writer","compliment_photos") as (user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos) from (select * from user_json_parsing) as q) as p;

Creating partitions

Pasted image 20231107135856

Physical Location

Pasted image 20231107140443

User with the most number of elite years

SELECT name, SIZE(elite) as nr_elite_years FROM users ORDER BY nr_elite_years DESC LIMIT 5;

Pasted image 20231107143134

Reviews Tables