-
Notifications
You must be signed in to change notification settings - Fork 0
/
pickThree.py
28 lines (25 loc) · 829 Bytes
/
pickThree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python
from pyspark import SparkContext, RDD
from pyspark.sql import SparkSession
from readCSVFile import readCSVFile
def pickThree(beerRDD):
    """ Pick the three best-rated beers to recommend

    Ratings are averaged per beer name; beers that have only a single
    review are left out before ranking.

    @type beerRDD: RDD
    @param beerRDD: rows exposing C{beer_name} and C{review_overall}
    @rtype: list
    @return: up to three (beer_name, average review_overall) pairs,
        highest average first
    """
    def toNameAndTally(row):
        # Seed every review with a count of one so the rating sum and the
        # review count can be folded together in a single reduce.
        return (row.beer_name, (row.review_overall, 1))

    def mergeTallies(left, right):
        return (left[0] + right[0], left[1] + right[1])

    def hasSeveralReviews(pair):
        # pair is (beer_name, (rating_sum, review_count))
        return pair[1][1] > 1

    def average(tally):
        # Adding 0.0 forces float division even when both parts are ints.
        return (tally[0] + 0.0) / tally[1]

    def byAverage(pair):
        return pair[1]

    tallies = beerRDD.map(toNameAndTally).reduceByKey(mergeTallies)
    averages = tallies.filter(hasSeveralReviews).mapValues(average)
    return averages.top(3, key=byAverage)
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    import sys

    # Default data set location; a different CSV can be supplied as the
    # first command-line argument.
    fileName = "/Users/wtchen/Research/beerAdvocate/beer_reviews/beer_reviews.csv"
    if len(sys.argv) > 1:
        fileName = sys.argv[1]

    sc = SparkContext()
    try:
        spark = SparkSession(sc)
        df = readCSVFile(fileName, spark, numOfPartition=4)
        rdd = df.rdd
        topThreeBeer = pickThree(rdd)
        # A single-argument print call behaves the same under Python 2 and
        # Python 3. The explicit separator space reproduces the spacing the
        # original two print statements emitted.
        print("%s %s" % ("The three beers I recommend are: ", topThreeBeer))
    finally:
        # Always release the Spark resources, even if loading or ranking
        # the reviews fails.
        sc.stop()