-
Notifications
You must be signed in to change notification settings - Fork 0
/
MickeyDz3.scala
164 lines (119 loc) · 7.45 KB
/
MickeyDz3.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Import trig functions
/*
Find the clusters of McDonalds Restaurants.
Given this data:
head mcdonalds.csv
-149.868307,61.140133,"McDonalds [WM]-Anchorage,AK","8900 Old Seward Hwy [WM], Anchorage,AK, (907) 344-5831"
-149.88113,61.192426,"McDonalds [WM]-Anchorage,AK","3101 A St [WM], Anchorage,AK, (907) 561-5137"
-149.898633,61.194753,"McDonalds-Anchorage,AK","800 W Northern Lights Blvd, Anchorage,AK, (907) 561-5525"
-149.870565,61.188348,"McDonalds-Anchorage,AK","701 E 36th Ave, Anchorage,AK, (907) 563-0658"
Convert Lat / Long values into X,Y,Z coordinates
Use KMeans clustering to determine the cluster centers.
Note: In a previous exercise, the optimal number of clusters was found to be about 120.
This number was determined by checking the error. The error amount always goes down as
more clusters are formed. However, after a certain point, the error doesn't reduce as fast.
The elbow point is what I am considering as the optimal point.
So I will use the value of 120 for the number of clusters. The number of iterations was set to 30.
For each store calculate its distance to the centers
and label it with the center it is closest to.
Find the center that has the most stores.
Find the store that is closest to this center.
FINDINGS:
The cluster that has the most stores can change at random ...
Because the cluster centers are formed by a randomly-select-and-then-iterate process,
there is no single fixed answer as to where a cluster's center will end up.
As a result, which cluster ends up with slightly more stores than the others
varies from run to run.
Subsequent runs of the following code yield varying results:
The McDonalds that is closest to the biggest cluster of 120 clusters is:
"McDonalds-New York,NY","208 Varick St, New York,NY, (212) 206-9991"
The McDonalds that is closest to the biggest cluster of 120 clusters is:
"McDonalds-River Edge,NJ","1118 Main St, River Edge,NJ, (201) 489-9216"
The McDonalds that is closest to the biggest cluster of 120 clusters is:
"McDonalds-The Villages,FL","320 Colony Blvd, The Villages,FL, (352) 753-9225"
The McDonalds that is closest to the biggest cluster of 120 clusters is:
"McDonalds-Englewood,NJ","41 W Palisade Ave, Englewood,NJ, (201) 894-0048"
The McDonalds that is closest to the biggest cluster of 120 clusters is:
"McDonalds-Tarrytown,NY","140 Wildey St, Tarrytown,NY, (914) 631-9620"
*/
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import Math.{PI,cos,sin,sqrt}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import com.github.fommil.netlib.BLAS;
/** Cartesian x/y/z position; used in this script both for stores and for cluster centers. */
case class CoordCC(
  x: Double,
  y: Double,
  z: Double
)
/** One parsed input record: the remaining text of the CSV line plus the store's Cartesian position. */
case class IndivMDzCC(
  info: String,
  coord: CoordCC
)
// Sphere radius used to scale the Cartesian coordinates.
// NOTE(review): 3959 matches Earth's mean radius in miles - presumably the intent; confirm.
val radius: Double = 3959.0
// Multiply an angle in degrees by this factor to obtain radians.
val radianPerDegree: Double = PI / 180.0
// Input CSV, read as one record per line.
val pathMD: String = "./mcdonalds.csv"
val dataMD = sc.textFile(pathMD)
// K-means settings.
val numClusters: Int = 120 // empirically found
val numIterations: Int = 30
// Number of input lines. count() is an action, so this line triggers a Spark job.
val cnt: Long = dataMD.count()
// Parse every CSV line into a store record carrying Cartesian coordinates.
//
// Line layout: <longitude>,<latitude>,<quoted name>,<quoted address>
// The sample row "-149.868307,61.140133,...Anchorage,AK" shows that the FIRST field is
// the longitude and the SECOND is the latitude (Anchorage lies near 61 N, 150 W).
// Reading them the other way round - as this code previously did - feeds the longitude
// into the latitude terms below, which is not a valid embedding of the sphere and
// distorts every distance the clustering relies on.
//
// A line with a missing comma or a non-numeric field still fails with an exception,
// as before.
val eachAllDataMD = dataMD.map { line =>
  val lonEnd = line.indexOf(",")
  val latEnd = line.indexOf(",", lonEnd + 1)
  val lon = radianPerDegree * line.substring(0, lonEnd).toDouble
  val lat = radianPerDegree * line.substring(lonEnd + 1, latEnd).toDouble
  // Spherical -> Cartesian: radius * cos(lat) is the radius of the circle of latitude.
  val ringRadius = radius * cos(lat)
  IndivMDzCC(
    // Everything after the two numeric fields (name and address) is kept verbatim.
    line.substring(latEnd + 1),
    CoordCC(ringRadius * cos(lon), ringRadius * sin(lon), radius * sin(lat))
  )
}.cache()
// One dense (x, y, z) MLlib vector per store; the resulting RDD is cached.
val eachCoordMD = eachAllDataMD.map { store =>
  val position = store.coord
  Vectors.dense(position.x, position.y, position.z)
}.cache()
// Fit k-means asking for numClusters centers with at most numIterations iterations.
val model = KMeans.train(eachCoordMD, numClusters, numIterations)
// Convert the fitted cluster centers to CoordCC values, preserving the model's order so
// that index i in this array refers to cluster i.
//
// The array is derived directly from model.clusterCenters rather than pre-sized to
// numClusters: KMeans may return fewer than the requested number of centers, in which
// case indexing 0 until numClusters would throw ArrayIndexOutOfBoundsException.
// (The unused top-level `var i`, which the loop variable shadowed, is gone as well.)
val centers: Array[CoordCC] = model.clusterCenters.map(c => CoordCC(c(0), c(1), c(2)))
// Wrap the whole centers array as a single-element RDD, so the cartesian product below
// produces exactly one (store, all centers) pair per store.
val centersRDD = sc.parallelize(Seq(centers))
val cart = eachAllDataMD.cartesian(centersRDD)
/*
scala> cart.map(x => x._1.info).collect
res98: Array[String] = Array("McDonalds [WM]-Anchorage,AK","8900 Old Seward Hwy [WM], Anchorage,AK, (907) 344-5831", "McDonalds [WM]-Anchorage,AK","3101 A St [WM], Anchorage,AK, (907) 561-5137", "McDonalds-Anchorage,AK","800 W Northern Lights Blvd, Anchorage,AK, (907) 561-5525", "McDonalds-Anchorage,AK","701 E 36th Ave, Anchorage,AK, (907) 563-0658", "McDonalds-Anchorage,AK","8915 Old Seward Hwy, Anchorage,AK, (907) 344-4231", "McDonalds-Anchorage,AK","1320 Huffman Rd, Anchorage,AK, (907) 345-5932", "McDonalds-Anchorage,AK","2601 E Tudor Rd, Anchorage,AK, (907) 562-2108", "McDonalds-Anchorage,AK","3006 Mountain View Dr, Anchorage,AK, (907) 274-5686", "McDonalds-Anchorage,AK","5716 Debarr Rd, Anchorage,AK, (907) 333-0413", "McDonalds-Anchorage,AK","255 Muldoon Rd, Anchorage,AK, (907) 337...
scala>
cart.map(x => (x._1.info, x._2(0))).collect
res100: Array[(String, CoordCC)] = Array(("McDonalds [WM]-Anchorage,AK","8900 Old Seward Hwy [WM], Anchorage,AK, (907) 344-5831",CoordCC(60.70071076847053,48.11477758734534,-3958.212439647893)), ("McDonalds [WM]-Anchorage,AK","3101 A St [WM], Anchorage,AK, (907) 561-5137",CoordCC(60.70071076847053,48.11477758734534,-3958.212439647893)), ("McDonalds-Anchorage,AK","800 W Northern Lights Blvd, Anchorage,AK, (907) 561-5525",CoordCC(60.70071076847053,48.11477758734534,-3958.212439647893)), ("McDonalds-Anchorage,AK","701 E 36th Ave, Anchorage,AK, (907) 563-0658",CoordCC(60.70071076847053,48.11477758734534,-3958.212439647893)), ("McDonalds-Anchorage,AK","8915 Old Seward Hwy, Anchorage,AK, (907) 344-4231",CoordCC(60.70071076847053,48.11477758734534,-3958.212439647893)), ("McDonalds-Anchorage,AK...
scala>
*/
// For every store, find the nearest cluster center by straight-line (chord) distance.
// Result element: (store, index of the nearest center, distance to that center).
//
// The running minimum starts at +Infinity. It used to start at `radius`, so a store
// lying farther than `radius` from every center was never matched and was silently
// reported as belonging to center 0 at distance `radius` (chord distances on the sphere
// can be as large as 2 * radius).
// Ties keep the lowest center index, as before. With an empty centers array the result
// is (store, 0, +Infinity).
val rel2Center = cart.map { case (store, clusterCenters) =>
  val (nearestIdx, nearestDist) =
    clusterCenters.indices.foldLeft((0, Double.PositiveInfinity)) { case ((bestIdx, bestDist), idx) =>
      val dx = store.coord.x - clusterCenters(idx).x
      val dy = store.coord.y - clusterCenters(idx).y
      val dz = store.coord.z - clusterCenters(idx).z
      val dist = Math.sqrt(dx * dx + dy * dy + dz * dz)
      if (dist < bestDist) (idx, dist) else (bestIdx, bestDist)
    }
  (store, nearestIdx, nearestDist)
}
/*
scala> rel2Center.map(store => (store._2, 1)).reduceByKey((a,b) => a + b).collect
res114: Array[(Int, Int)] = Array((96,92), (112,103), (16,447), (80,105), (48,164), (32,46), (0,223), (64,22), (81,102), (97,12), (65,74), (33,113), (113,37), (1,58), (17,478), (49,188), (34,11), (82,90), (66,137), (98,28), (50,4), (18,198), (114,8), (2,196), (19,313), (115,122), (35,105), (51,168), (83,59), (67,75), (3,218), (99,393), (84,101), (100,336), (52,138), (4,21), (116,18), (36,47), (20,34), (68,240), (101,4), (21,98), (53,90), (117,131), (37,228), (69,157), (85,131), (5,29), (22,95), (54,122), (102,9), (118,9), (6,273), (70,39), (38,112), (86,26), (39,4), (119,54), (71,148), (55,86), (23,285), (7,144), (103,21), (87,13), (56,138), (104,109), (24,167), (40,138), (72,56), (8,215), (88,35), (41,290), (25,35), (105,88), (73,28), (57,281), (89,212), (9,149), (42,77), (106,15), (74...
scala>
The McDonalds that is closest to the biggest cluster of 100 clusters is:
"McDonalds PlayPlace-Homer Glen,IL","14298 S Bell Rd, Homer Glen,IL, (708) 301-2332"
*/
// Count the stores assigned to each center, then keep the single (storeCount, centerIndex)
// pair with the highest count. The result is an array holding at most one element.
val maxCenter = {
  val storesPerCenter = rel2Center.map { case (_, centerIdx, _) => (centerIdx, 1) }.reduceByKey(_ + _)
  // Swap to (count, centerIndex) so the count becomes the sort key; descending order puts the largest first.
  storesPerCenter.map { case (centerIdx, storeCount) => (storeCount, centerIdx) }.sortByKey(ascending = false).take(1)
}
printf("Center #%d has %d stores\n", maxCenter(0)._2, maxCenter(0)._1)
// Among the stores assigned to the most populated center, pick the one with the smallest
// distance to that center. The result is an array of (distance, store) with at most one element.
val centerMost = rel2Center.
  filter { case (_, centerIdx, _) => centerIdx == maxCenter(0)._2 }.
  map { case (store, _, dist) => (dist, store) }.
  sortByKey(ascending = true).
  take(1)
printf(
  "The McDonalds that is closest to the biggest cluster of %d clusters is:\n %s\n",
  numClusters,
  centerMost(0)._2.info
)