..
Hive
Sample record
{
// STRING, 22 character unique STRING business id
"business_id": "tnhfDv5Il8EaGSXZGiuQGg",
// STRING, the business's name
"name": "Garaje",
// STRING, the full address of the business
"address": "475 3rd St",
// STRING, the city
"city": "San Francisco",
// STRING, 2 character state code, if applicable
"state": "CA",
// STRING, the postal code
"postal code": "94107",
// FLOAT, latitude
"latitude": 37.7817529521,
// FLOAT, longitude
"longitude": -122.39612197,
// FLOAT, star rating, rounded to half-stars
"stars": 4.5,
// integer, number of reviews
"review_count": 1198,
// integer, 0 or 1 for closed or open, respectively
"is_open": 1,
// object, business attributes to values. note: some attribute values might be objects
"attributes": {
"RestaurantsTakeOut": true,
"BusinessParking": {
"garage": false,
"street": true,
"validated": false,
"lot": false,
"valet": false
},
},
// an array of STRINGs of business categories
"categories": [
"Mexican",
"Burgers",
"Gastropubs"
],
// an object of key day to value hours, hours are using a 24hr clock
"hours": {
"Monday": "10:00-21:00",
"Tuesday": "10:00-21:00",
"Friday": "10:00-21:00",
"Wednesday": "10:00-21:00",
"Thursday": "10:00-21:00",
"Sunday": "11:00-18:00",
"Saturday": "10:00-21:00"
}
}
Hive Commands
Loading the dataset
CREATE DATABASE yelp;
USE yelp;
CREATE TABLE business_json_parsing (json STRING);
LOAD DATA INPATH "/user/hduser/yelp/business.json" INTO TABLE business_json_parsing;
SELECT json_tuple(json, "business_id", "name", "address", "city", "state", "stars") from (select * from business_json_parsing) as q;
CREATE TABLE business (business_id string, name string, address string, city string, state string, stars float);
INSERT INTO business SELECT json_tuple(json, "business_id", "name", "address", "city", "state", "stars") from (select * from business_json_parsing) as q;
Average rating per state
SELECT state, AVG(stars) FROM business GROUP BY state;
+--------+---------------------+
| state | _c1 |
+--------+---------------------+
| AB | 3.447514803516957 |
| AZ | 3.5920096852300243 |
| CA | 3.9967326542379396 |
| CO | 4.0 |
| DE | 3.3549668874172185 |
| FL | 3.6109570831750855 |
| HI | 4.25 |
| ID | 3.7076337586747257 |
| IL | 3.3696969696969696 |
| IN | 3.5882457544234017 |
| LA | 3.679161628375655 |
| MA | 1.25 |
| MI | 2.5 |
| MO | 3.546091817098873 |
| MT | 5.0 |
| NC | 2.0 |
| NJ | 3.4591143392689783 |
| NV | 3.7368762151652626 |
| PA | 3.5730191838773173 |
| SD | 4.5 |
| TN | 3.571499668214997 |
| TX | 2.875 |
| UT | 4.5 |
| VI | 2.5 |
| VT | 4.5 |
| WA | 3.5 |
| XMS | 4.0 |
+--------+---------------------+
Number of restaurants per state
SELECT state, COUNT(business_id) FROM business GROUP BY state;
+--------+--------+
| state | _c1 |
+--------+--------+
| AB | 5573 |
| AZ | 9912 |
| CA | 5203 |
| CO | 3 |
| DE | 2265 |
| FL | 26330 |
| HI | 2 |
| ID | 4467 |
| IL | 2145 |
| IN | 11247 |
| LA | 9924 |
| MA | 2 |
| MI | 1 |
| MO | 10913 |
| MT | 1 |
| NC | 1 |
| NJ | 8536 |
| NV | 7715 |
| PA | 34039 |
| SD | 1 |
| TN | 12056 |
| TX | 4 |
| UT | 1 |
| VI | 1 |
| VT | 1 |
| WA | 2 |
| XMS | 1 |
+--------+--------+
Number of distinct Restaurants
SELECT COUNT(DISTINCT(name)) FROM business;
Top Rated Restaurant in each state
SELECT a.state, a.stars, a.name FROM business a INNER JOIN (SELECT state, MAX(stars) stars FROM business GROUP BY state) b ON a.state = b.state AND a.stars = b.stars;
Restaurants with the most branches
SELECT name, count(address) locations FROM business GROUP BY name HAVING locations > 1 ORDER BY locations DESC;
Restaurants which covers the most states
SELECT name, COUNT(DISTINCT(state)) state FROM business GROUP BY name HAVING state >1 ORDER BY state;
Number of branches per state
SELECT name, state, COUNT(address) address FROM business GROUP BY name, state HAVING address > 1 ORDER BY address;