..

Hive

Sample record

{
    // STRING, 22 character unique STRING business id
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // STRING, the business's name
    "name": "Garaje",

    // STRING, the full address of the business
    "address": "475 3rd St",

    // STRING, the city
    "city": "San Francisco",

    // STRING, 2 character state code, if applicable
    "state": "CA",

    // STRING, the postal code
    "postal code": "94107",

    // FLOAT, latitude
    "latitude": 37.7817529521,

    // FLOAT, longitude
    "longitude": -122.39612197,

    // FLOAT, star rating, rounded to half-stars
    "stars": 4.5,

    // integer, number of reviews
    "review_count": 1198,

    // integer, 0 or 1 for closed or open, respectively
    "is_open": 1,

    // object, business attributes to values. note: some attribute values might be objects
    "attributes": {
        "RestaurantsTakeOut": true,
        "BusinessParking": {
            "garage": false,
            "street": true,
            "validated": false,
            "lot": false,
            "valet": false
        },
    },

    // an array of STRINGs of business categories
    "categories": [
        "Mexican",
        "Burgers",
        "Gastropubs"
    ],

    // an object of key day to value hours, hours are using a 24hr clock
    "hours": {
        "Monday": "10:00-21:00",
        "Tuesday": "10:00-21:00",
        "Friday": "10:00-21:00",
        "Wednesday": "10:00-21:00",
        "Thursday": "10:00-21:00",
        "Sunday": "11:00-18:00",
        "Saturday": "10:00-21:00"
    }

}

Hive Commands

Loading the dataset

CREATE DATABASE yelp;
USE yelp;
CREATE TABLE business_json_parsing (json STRING);
LOAD DATA INPATH "/user/hduser/yelp/business.json" INTO TABLE business_json_parsing;
SELECT json_tuple(json, "business_id", "name", "address", "city", "state", "stars")  from (select * from business_json_parsing) as q;
CREATE TABLE business (business_id string, name string, address string, city string, state string, stars float);
INSERT INTO business SELECT json_tuple(json, "business_id", "name", "address", "city", "state", "stars")  from (select * from business_json_parsing) as q;

Average rating per state

SELECT state, AVG(stars) FROM business GROUP BY state;
+--------+---------------------+
| state  |         _c1         |
+--------+---------------------+
| AB     | 3.447514803516957   |
| AZ     | 3.5920096852300243  |
| CA     | 3.9967326542379396  |
| CO     | 4.0                 |
| DE     | 3.3549668874172185  |
| FL     | 3.6109570831750855  |
| HI     | 4.25                |
| ID     | 3.7076337586747257  |
| IL     | 3.3696969696969696  |
| IN     | 3.5882457544234017  |
| LA     | 3.679161628375655   |
| MA     | 1.25                |
| MI     | 2.5                 |
| MO     | 3.546091817098873   |
| MT     | 5.0                 |
| NC     | 2.0                 |
| NJ     | 3.4591143392689783  |
| NV     | 3.7368762151652626  |
| PA     | 3.5730191838773173  |
| SD     | 4.5                 |
| TN     | 3.571499668214997   |
| TX     | 2.875               |
| UT     | 4.5                 |
| VI     | 2.5                 |
| VT     | 4.5                 |
| WA     | 3.5                 |
| XMS    | 4.0                 |
+--------+---------------------+

Pasted image 20231031132216

Number of restaurants per state

SELECT state, COUNT(business_id) FROM business GROUP BY state;
+--------+--------+
| state  |  _c1   |
+--------+--------+
| AB     | 5573   |
| AZ     | 9912   |
| CA     | 5203   |
| CO     | 3      |
| DE     | 2265   |
| FL     | 26330  |
| HI     | 2      |
| ID     | 4467   |
| IL     | 2145   |
| IN     | 11247  |
| LA     | 9924   |
| MA     | 2      |
| MI     | 1      |
| MO     | 10913  |
| MT     | 1      |
| NC     | 1      |
| NJ     | 8536   |
| NV     | 7715   |
| PA     | 34039  |
| SD     | 1      |
| TN     | 12056  |
| TX     | 4      |
| UT     | 1      |
| VI     | 1      |
| VT     | 1      |
| WA     | 2      |
| XMS    | 1      |
+--------+--------+

Pasted image 20231031132325

Number of distinct Restaurants

SELECT COUNT(DISTINCT(name)) FROM business;

Pasted image 20231031132445

Top Rated Restaurant in each state

SELECT a.state, a.stars, a.name FROM business a INNER JOIN (SELECT state, MAX(stars) stars FROM business GROUP BY state) b ON a.state = b.state AND a.stars = b.stars;

Pasted image 20231031135419

Restaurants with the most branches

SELECT name, count(address) locations FROM business GROUP BY name HAVING locations > 1 ORDER BY locations DESC;

Pasted image 20231031140149

Restaurants which covers the most states

SELECT name, COUNT(DISTINCT(state)) state FROM business GROUP BY name HAVING state >1 ORDER BY state;

Pasted image 20231031140446

Number of branches per state

SELECT name, state, COUNT(address) address FROM business GROUP BY name, state HAVING address > 1 ORDER BY address;

Pasted image 20231031140703