..

Final Documentation

19CSE357 Evaluation 3 - Pig and Hive

By Deebakkarthi C R [CB.EN.U4CSE20613]

Dataset

  • The dataset used is Yelp Dataset
  • The tables in this dataset is
    • Business
      • business_id
      • name
      • address
      • city
      • state
      • postal code
      • latitude
      • longitude
      • stars
      • review_count
      • is_open
      • attributes
      • categories
      • hours
    • Review
      • review_id
      • user_id
      • business_id
      • stars
      • date
      • text
      • useful
      • funny
      • cool
    • Users
      • user_id
      • name
      • review_count
      • yelping_since
      • friends
      • useful
      • funny
      • cool
      • fans
      • elite
      • average_stars
      • compliment_hot
      • compliment_more
      • compliment_profile
      • compliment_cute
      • compliment_list
      • compliment_note
      • compliment_plain
      • compliment_cool
      • compliment_funny
      • compliment_writer
      • compliment_photos
    • Checkin
      • business_id
      • date
    • Tips
      • text
      • date
      • compliment_count
      • business_id
      • user_id
    • Photos
      • photo_id
      • business_id
      • caption
      • label
Table No. of Records
Business $150,000$
Review $6,900,000$

Salient features of this dataset

  • Rich and diverse data
  • Real-world data
  • Large Scale
  • Social Network aspects

Pig

Basic Queries

Loading Data

tips_json_parsing = LOAD '/user/hduser/yelp/tip.json' USING JsonLoader('user_id:chararray, business_id:chararray, text: chararray, data: chararray, compliment_count:int');

Number of comments by each user

group_user = GROUP tips_json_parsing  BY user_id;
tmp = FOREACH group_user GENERATE group, COUNT(tips_json_parsing);
dump tmp;

Pasted image 20231031144013

User with the most comments

tmp2 = ORDER tmp by $1

Pasted image 20231031144252

User with most compliments

group_user = GROUP tips_json_parsing  BY user_id;
tmp = FOREACH group_user GENERATE (tips_json_parsing.compliment_count, SUM(tips_json_parsing.compliment_count));
tmp2 = ORDER tmp BY $0;
dump tmp2;

Pasted image 20231031145139

Restaurant with the most compliments

group_user = GROUP tips_json_parsing  BY business_id;
tmp = GROUP tips_json_parsing BY business_id;
tmp2 = FOREACH tmp GENERATE group, COUNT(tips_json_parsing.compliment_count);
tmp2 = ORDER tmp2 by $1;
dump tmp2

Pasted image 20231031150125

Complex Queries

Loading the dataset

business_json = LOAD  '/user/hduser/yelp/business.json' USING JsonLoader('business_id:chararray, name:chararray, address:chararray, city:chararray, state:chararray, postal_code:chararray, latitude:float, longitude:float, stars:float, review_count:int, is_open:int');

Pasted image 20231121101432

Correlation Coefficient of review_count and stars

review_and_stars = FOREACH business_json GENERATE (double)stars, (double)review_count;
rel = GROUP review_and_stars BY ALL;
corop = FOREACH rel GENERATE COR(review_and_stars.stars, review_and_stars.review_count);

Pasted image 20231121101319

Which year produced the most number of elite users

Loading the dataset

user_json = LOAD '/user/hduser/yelp/user.json' USING JsonLoader('user_id:chararray, name:chararray, review_count:int, yelping_since:chararray, useful:int, funny:int, cool:int, elite:chararray, friends:chararray, fans:int,  average_stars:float, compliment_hot:int, compliment_more:int, compliment_profile:int, compliment_cute:int, compliment_list:int, compliment_note:int, compliment_plain:int, compliment_cool:int, compliment_funny:int, compliment_writer:int, compliment_photos:int');

Pasted image 20231121110801

year_alone = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4);
nr_elite_years = FOREACH user_json GENERATE SIZE(STRSPLIT(elite, ','));
year_and_nr_elite = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4), SIZE(STRSPLIT(elite, ','));
grouped_data = GROUP year_and_nr_elite BY year;
result = foreach grouped_data GENERATE group, SUM(year_and_nr_elite.nr_elite);
sorted_result = ORDER result by $0;

Pasted image 20231121111433 Pasted image 20231121111612

user_and_nr_friends = FOREACH user_json GENERATE name, SIZE(STRSPLIT(friends, ','));
result = order user_and_nr_friends BY $1;

Pasted image 20231121112554

Most visited restaurant

Loading the dataset

checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');

Pasted image 20231121112950

nr_visited = FOREACH checkin_json GENERATE business_id, SIZE(STRSPLIT(date, ','));

Pasted image 20231121113116 Pasted image 20231121113054

business_id_and_name = FOREACH business_json generate business_id, name;
result = join most_visited by business_id, business_id_and_name by business_id;
result = order result by $1;

Pasted image 20231121113739

UDF

import java.io.IOException;  
import java.util.HashMap;  
import java.util.Map;  
  
import org.apache.pig.EvalFunc;  
import org.apache.pig.data.Tuple;  
  
public class GroupDatesByYear extends EvalFunc<String> {  
    public String convertMapToString(Map<String, Integer> map) {  
        StringBuilder mapAsString = new StringBuilder("{");  
        for (String key : map.keySet()) {  
            mapAsString.append(key + "=" + map.get(key).toString() + ", ");  
        }  
        mapAsString.delete(mapAsString.length() - 2, mapAsString.length()).append("}");  
        return mapAsString.toString();  
    }  
  
    public String exec(Tuple input) throws IOException {  
        if (input == null || input.size() == 0)  
            return null;  
        String str = (String) input.get(0);  
        String[] timestamps = str.split(",");  
        Map<String, Integer> hm = new HashMap<String, Integer>();  
        for (String timestamp : timestamps) {  
            String year = timestamp.strip().split(" ")[0].split("-")[0];  
            Integer tmp = hm.get(year);  
            if (tmp == null) {  
                hm.put(year, 1);  
            } else {  
                hm.put(year, tmp + 1);  
            }  
        }  
        return convertMapToString(hm);  
    }  
}
REGISTER './udf.jar';
checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');
result = FOREACH checkin_json GENERATE GroupDatesByYear(date);
illustrate;

Pasted image 20231121123849


MongoDB

Loading dataset

Pasted image 20231121130843 Pasted image 20231121130941

Average rating per state

db.business.aggregate([
    {
        $group: {
            _id: "$state",
            avgStars: { $avg: "$stars" }
        }
    }
])

Pasted image 20231121131453

Number of restaurants per state

db.business.aggregate([
    {
        $group: {
            _id: "$state",
            count: { $sum: 1 }
        }
    },
    {
        $project: {
            _id: 0,
            state: "$_id",
            businessCount: "$count"
        }
    }
])

Pasted image 20231121131947

Number of distinct Restaurants

db.business.aggregate([
    {
        $group: {
            _id: null,
            distinctNameCount: { $addToSet: "$name" }
        }
    },
    {
        $project: {
            _id: 0,
            distinctNameCount: { $size: "$distinctNameCount" }
        }
    }
])

Pasted image 20231121132043

Top Rated Restaurant in each state

db.business.aggregate([
    {
        $lookup: {
            from: "business",
            let: { state: "$state", stars: "$stars" },
            pipeline: [
                {
                    $group: {
                        _id: "$state",
                        maxStars: { $max: "$stars" }
                    }
                },
                {
                    $match: {
                        $expr: {
                            $and: [
                                { $eq: ["$$state", "$_id"] },
                                { $eq: ["$$stars", "$maxStars"] }
                            ]
                        }
                    }
                }
            ],
            as: "maxStarsData"
        }
    },
    {
        $unwind: "$maxStarsData"
    },
    {
        $match: {
            $expr: {
                $and: [
                    { $eq: ["$state", "$maxStarsData._id"] },
                    { $eq: ["$stars", "$maxStarsData.maxStars"] }
                ]
            }
        }
    },
    {
        $project: {
            _id: 0,
            state: 1,
            stars: 1,
            name: 1
        }
    }
])

Pasted image 20231121132144

Restaurants with the most branches

db.business.aggregate([
    {
        $group: {
            _id: "$name",
            locations: { $addToSet: "$address" },
            count: { $sum: 1 }
        }
    },
    {
        $match: {
            count: { $gt: 1 }
        }
    },
    {
        $sort: {
            count: -1
        }
    },
    {
        $project: {
            _id: 0,
            name: "$_id",
            locations: 1
        }
    }
])

Pasted image 20231121132304

Restaurants which covers the most states

db.business.aggregate([
    {
        $group: {
            _id: "$name",
            uniqueStates: { $addToSet: "$state" },
            count: { $sum: 1 }
        }
    },
    {
        $match: {
            count: { $gt: 1 }
        }
    },
    {
        $sort: {
            count: 1
        }
    },
    {
        $project: {
            _id: 0,
            name: "$_id",
            state: { $size: "$uniqueStates" }
        }
    }
])

Pasted image 20231121132444

Number of branches per state

db.business.aggregate([
    {
        $group: {
            _id: {
                name: "$name",
                state: "$state"
            },
            addressCount: { $sum: 1 }
        }
    },
    {
        $match: {
            addressCount: { $gt: 1 }
        }
    },
    {
        $sort: {
            addressCount: 1
        }
    },
    {
        $project: {
            _id: 0,
            name: "$_id.name",
            state: "$_id.state",
            address: "$addressCount"
        }
    }
])

Pasted image 20231121132531

Complex Queries

db.user.aggregate([
    {
        $project: {
            name: 1,
            nr_friends: {
                $size: {
                    $ifNull: [
                        { $split: ["$friends", ","] },
                        []
                    ]
                }
            }
        }
    },
    {
        $sort: {
            nr_friends: -1
        }
    }
])

Pasted image 20231121135348

Correlation Coefficient of review_count and stars

db.business.aggregate([
    {
        $project: {
            stars: { $toDouble: "$stars" },
            review_count: { $toDouble: "$review_count" }
        }
    },
    {
        $group: {
            _id: null,
            review_and_stars: { $push: { stars: "$stars", review_count: "$review_count" } }
        }
    },
    {
        $project: {
            _id: 0,
            correlation: {
                $function: {
                    body: "function calculateCorrelation(a, b) {const n = a.length, m = a.reduce((s, v) => s + v, 0) / n, k = b.reduce((s, v) => s + v, 0) / n, cov = Array.from({ length: n }, (_, i) => (a[i] - m) * (b[i] - k)).reduce((s, v) => s + v, 0), var1 = a.reduce((s, v) => s + (v - m) * (v - m), 0), var2 = b.reduce((s, v) => s + (v - k) * (v - k), 0); return cov / Math.sqrt(var1 * var2);}
",
                    args: ["$review_and_stars.stars", "$review_and_stars.review_count"],
                    lang: "js"
                }
            }
        }
    }
])

Pasted image 20231121140227

Most visited restaurant

db.checkin.aggregate([
    {
        $project: {
            business_id: 1,
            _id: 0,
            nr_visited: { $size: { $split: ["$date", ","] } }
        }
    },
    {
        $lookup: {
            from: "business",
            localField: "business_id",
            foreignField: "business_id",
            as: "business_info"
        }
    },
    {
        $unwind: "$business_info"
    },
    {
        $project: {
            _id: 0,
            name: "$business_info.name",
            nr_visited: 1
        }
    }
])

Pasted image 20231121142036

Which year produced the most number of elite users

db.user.aggregate([
    {
        $project: {
            year: { $toInt: { $substr: ["$yelping_since", 0, 4] } },
            nr_elite: { $size: { $split: ["$elite", ','] } }
        }
    },
    {
        $group: {
            _id: "$year",
            total_nr_elite: { $sum: "$nr_elite" }
        }
    },
    {
        $sort: {
            _id: 1
        }
    }
])

Pasted image 20231121142433

Finding Average Star by State using MapReduce

var mapFunction = function() {
    emit(this.state, { totalStars: this.stars });
};

var reduceFunction = function(key, values) {
    var reducedObject = { count: 0, totalStars: 0 };

    values.forEach(function(value) {
        reducedObject.count += 1;
        reducedObject.totalStars += value.totalStars;
    });

    reducedObject.averageStars = reducedObject.totalStars / reducedObject.count;

    return { state: key, averageStars: reducedObject.averageStars };
};

db.business.mapReduce(
    mapFunction,
    reduceFunction,
    {
        out: { inline: 1 }
    }
);

Pasted image 20231121143353

Cursor

var cursor = db.business.find({"state": "AZ"})
while (cursor.hasNext()) { 
	var document = cursor.next();
	printjson(document);
}

Pasted image 20231121143626