Bucket merging and percentileHigherIsBetter and percentileLowerIsBetter upgrades
lmaccherone committed Jun 3, 2013
1 parent dae4927 commit dc641ad
Showing 9 changed files with 265 additions and 113 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ test/*.map
test/*.js
play/*.map
play/*.js
play/*.csv
lumenize.js
lumenize.map
deploy
1 change: 1 addition & 0 deletions docs/Lumenize-docs/index.html
@@ -372,6 +372,7 @@ <h1>Lumenize</h1>
<h3>Overview, Transform, and Histogram</h3>
<ul class='links'>
<li><a href="#!/api/Lumenize" rel="Lumenize" class="docClass">Lumenize</a></li>
<li><a href="#!/api/Lumenize.histogram" rel="Lumenize.histogram" class="docClass">Lumenize.histogram</a></li>
</ul>
</div>
<div class='middle-column'>
2 changes: 1 addition & 1 deletion docs/Lumenize-docs/output/Lumenize.histogram.js

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions docs/Lumenize-docs/source/dataTransform.coffee.html
@@ -186,18 +186,20 @@
headerLength = rows[0].split(',').length

out = []
for row in rows
for row, index in rows
newRow = []
rawRowArray = row.split(',')
if rawRowArray.length isnt headerLength
throw new Error('Row length does not match header length.')
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
if rawRowArray.length is headerLength
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
else
# throw new Error('Row length does not match header length.')
console.log("Warning: Skipping row because length does not match header length in row #{index}: #{row}")

return out

165 changes: 116 additions & 49 deletions docs/Lumenize-docs/source/histogram.coffee.html
@@ -55,11 +55,7 @@
* h = histogram.histogram(grades, 'average')
*
* console.log(h)
* # [ { index: 0,
* # startOn: -Infinity,
* # endBelow: Infinity,
* # label: 'all',
* # count: 2 } ]
* # [ { index: 0, startOn: null, endBelow: null, label: 'all', count: 2 } ]
*
* Or, we can just pass in a list of values
*
@@ -116,12 +116,12 @@
lowerBase = firstStartOn
else
lowerBase = roundDownToSignificance(min, significance)
firstStartOn = -Infinity
firstStartOn = null
if lastEndBelow?
upperBase = lastEndBelow
else
upperBase = roundUpToSignificance(max, significance)
lastEndBelow = Infinity
lastEndBelow = null

return {values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase}

@@ -144,26 +140,18 @@

# first bucket
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEdge}
if firstStartOn is -Infinity
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

# all the buckets in the middle
for i in [1..bucketCount - 2]
edge = lastEdge + bucketSize
buckets.push({index: i, startOn: lastEdge, endBelow: edge, label: "#{lastEdge}-#{edge}"})
buckets.push({index: i, startOn: lastEdge, endBelow: edge})
lastEdge = edge

# last bucket
if lastEdge >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance.")
if lastEdge? and lastEndBelow? and lastEdge >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance. lastEdge: #{lastEdge} lastEndBelow: #{lastEndBelow}")
bucket = {index:bucketCount - 1, startOn: lastEdge, endBelow: lastEndBelow}
if lastEndBelow is Infinity
bucket.label = ">= #{bucket.startOn}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

return buckets
@@ -172,36 +160,28 @@
{values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase} = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance)

if bucketCount < 3
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEndBelow, label: 'all'}
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEndBelow}
buckets.push(bucket)
return buckets

bucketSize = 100 / bucketCount
buckets = [] # each row is {index, startOn, endBelow, label} meaning bucket startOn <= x < endBelow
buckets = [] # each row is {index, startOn, endBelow} meaning bucket startOn <= x < endBelow

# first bucket
currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize)(values), significance)
bucket = {index: 0, startOn: firstStartOn, endBelow: currentBoundary}
if firstStartOn is -Infinity
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

# all the buckets in the middle
for i in [1..bucketCount - 2]
lastBoundary = currentBoundary
currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize * (i + 1))(values), significance)
buckets.push({index: i, startOn: lastBoundary, endBelow: currentBoundary, label: "#{lastBoundary}-#{currentBoundary}"})
buckets.push({index: i, startOn: lastBoundary, endBelow: currentBoundary})

# last bucket
if lastBoundary >= lastEndBelow
if lastBoundary? and lastEndBelow? and lastBoundary >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a different bucketCount.")
bucket = {index:bucketCount - 1, startOn: currentBoundary, endBelow: lastEndBelow}
if lastEndBelow is Infinity
bucket.label = ">= #{bucket.startOn}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

return buckets
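For orientation, here is a minimal sketch of driving these bucketing strategies through the public histogram.buckets API; the data points, significance, and bounds below are invented for illustration, and the exact boundaries will differ for real data.

    {histogram} = require('../')

    # Ten made-up data points; the field name 'x' is arbitrary.
    rows = ({x: v} for v in [3, 7, 12, 18, 22, 27, 31, 36, 44, 49])

    # Constant-width buckets with explicit bounds keep every bucket closed, so each
    # gets a "startOn-endBelow" label. Leaving firstStartOn/lastEndBelow null (the
    # default) would make the first and last buckets open-ended instead.
    buckets = histogram.buckets(rows, 'x', histogram.bucketsConstantWidth, 1, 0, 50, 5)
    console.log(buckets)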
Expand All @@ -212,8 +192,9 @@
* @method bucketsPercentile
* @member Lumenize.histogram
*
* This is a short cut to creating a set of exactly 100 buckets to be used for bucketing (or scoring) values in percentiles.
* The index of the bucket is the percentile. Note: You can't score in the 100th percentile because you can't beat your own score.
* This is a short cut to creating a set of buckets for "scoring" in percentiles (think standardized testing).
*
* Note: You can't score in the 100th percentile because you can't beat your own score.
* If you have a higher score than anybody else, you didn't beat your own score. So, you aren't better than 100%. If there are
* less than 100 total scores then you technically can't even be in the 99th percentile. This function is hard-coded
* to only create 100 buckets. However, if you wanted to calculate fractional percentiles. Say you want to know who
@@ -254,7 +235,7 @@
* Let's create a little helper function to convert the percentiles to grades. It includes a call to `histogram.bucket`.
*
* getGrade = (average, buckets) ->
* percentile = histogram.bucket(average, buckets).index
* percentile = histogram.bucket(average, buckets).percentileHigherIsBetter
* if percentile >= 90
* return 'A'
* else if percentile >= 60
@@ -287,21 +268,27 @@
*
* @return {Object[]}
*
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label}
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, percentileHigherIsBetter, percentileLowerIsBetter}
*
* To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)`
*
* The buckets array that is returned will have these properties:
*
* * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
* * There will be exactly 100 buckets.
* * The index of the first one will be 0.
* * The index of the last one will be 99.
* * The first startOn will be -Infinity
* * The last endBelow will be Infinity.
* To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)` and
* then read the percentileHigherIsBetter or percentileLowerIsBetter of the bucket that is returned.
*/
/* <CoffeeScript>
return histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100)
buckets = histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100)
percentile = 0
for b in buckets
if b.matchingRangeIndexEnd?
b.percentileHigherIsBetter = b.matchingRangeIndexStart
b.percentileLowerIsBetter = 99 - b.matchingRangeIndexEnd
percentile = b.matchingRangeIndexEnd
delete b.matchingRangeIndexEnd
delete b.matchingRangeIndexStart
else
b.percentileHigherIsBetter = percentile
b.percentileLowerIsBetter = 99 - percentile
percentile++

return buckets

histogram.buckets = (rows, valueField, type = histogram.bucketsConstantWidth, significance, firstStartOn, lastEndBelow, bucketCount) ->
</CoffeeScript> */
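To make the new percentileHigherIsBetter and percentileLowerIsBetter fields concrete, here is a small usage sketch; the scores are invented and the exact bucket boundaries depend on the data.

    {histogram} = require('../')

    # Made-up student averages.
    grades = ({average: s} for s in [37, 52, 55, 61, 68, 72, 72, 78, 83, 91])

    buckets = histogram.bucketsPercentile(grades, 'average')

    # Look up the bucket for one score and read both percentile interpretations.
    b = histogram.bucket(72, buckets)
    console.log(b.percentileHigherIsBetter)  # percentile when a higher raw score is better
    console.log(b.percentileLowerIsBetter)   # complementary view when a lower raw score is better

Both fields stay in the 0-99 range because, as the doc comment above explains, you can't score in the 100th percentile.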
@@ -332,11 +319,61 @@
* The buckets array that is returned will have these properties:
*
* * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
* * If firstStartOn is not provided, it will be -Infinity
* * If lastEndBelow is not provided, it will be Infinity.
* * Duplicate buckets are merged. When they are merged two fields are added to the resulting merged bucket:
* {matchingRangeIndexStart, matchingRangeIndexEnd} indicating the range that this bucket replaces.
* * If firstStartOn is not provided, it will be null indicating -Infinity
* * If lastEndBelow is not provided, it will be null indicating Infinity.
*/
/* <CoffeeScript>
return type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount)
tempBuckets = type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount)

# return tempBuckets

if tempBuckets.length < 2
buckets = tempBuckets
else # merge duplicate buckets
buckets = []
startOfMatching = tempBuckets[0]
gotToEnd = false
i = 1
while i < tempBuckets.length
currentBucket = tempBuckets[i]
if startOfMatching.startOn == currentBucket.startOn
i++
currentBucket = tempBuckets[i]
while currentBucket? and startOfMatching.startOn == currentBucket.startOn and startOfMatching.endBelow == currentBucket.endBelow
i++
currentBucket = tempBuckets[i]
if i >= tempBuckets.length - 1
currentBucket = tempBuckets[tempBuckets.length - 1]
gotToEnd = true
startOfMatching.matchingRangeIndexStart = startOfMatching.index
startOfMatching.matchingRangeIndexEnd = currentBucket.index
startOfMatching.endBelow = currentBucket.endBelow
buckets.push(startOfMatching)
i++
currentBucket = tempBuckets[i]
else
buckets.push(startOfMatching)
startOfMatching = currentBucket
i++
unless gotToEnd
buckets.push(currentBucket)

# reindex and add labels
for bucket, index in buckets
bucket.index = index
# delete bucket.index
if bucket.startOn? and bucket.endBelow?
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
else if bucket.startOn?
bucket.label = ">= #{bucket.startOn}"
else if bucket.endBelow?
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "all"

return buckets

histogram.bucket = (value, buckets) ->
</CoffeeScript> */
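The merge pass above collapses buckets that come back with identical boundaries (common for constant-depth bucketing over heavily repeated values), records the collapsed range in matchingRangeIndexStart/matchingRangeIndexEnd, reindexes, and labels open ends with "< n" / ">= n". A rough sketch with invented data:

    {histogram} = require('../')

    # Heavily repeated values, so several constant-depth boundaries coincide.
    rows = ({x: v} for v in [1, 2, 2, 2, 2, 2, 2, 3, 9, 12])

    buckets = histogram.buckets(rows, 'x', histogram.bucketsConstantDepth, null, null, null, 10)

    # Expect fewer than 10 buckets back: duplicates are merged and reindexed, and the
    # first/last buckets get "< n" / ">= n" labels because firstStartOn and
    # lastEndBelow default to null.
    console.log(buckets.length)
    console.log(buckets)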
@@ -360,9 +397,39 @@
* because you can't beat your own score. This is simlar logic.
*/
/* <CoffeeScript>
for b in buckets
unless value?
return null

# middle buckets
if buckets.length >= 3
for i in [1..buckets.length - 2]
b = buckets[i]
if b.startOn <= value < b.endBelow
return b

# convoluted logic so it works for buckets of length 1, 2, and 3+
b = buckets[0]
if b.startOn? and b.endBelow?
if b.startOn <= value < b.endBelow
return b
else if b.startOn?
if b.startOn <= value
return b
else if b.endBelow?
if value < b.endBelow
return b
else if !b.startOn? and !b.endBelow?
return b

# the only situation where you get to this point is when startOn is non-null and it might be the last bucket
b = buckets[buckets.length - 1]
if b.endBelow?
if b.startOn <= value < b.endBelow
return b
else
if b.startOn <= value
return b

return null

histogram.histogramFromBuckets = (rows, valueField, buckets) ->
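For reference, a hand-rolled sketch of the lookup behaviour the rewritten histogram.bucket targets; the bucket objects below are built by hand rather than produced by the library.

    {histogram} = require('../')

    # startOn: null means "from -Infinity"; endBelow: null means "up to Infinity".
    buckets = [
      {index: 0, startOn: null, endBelow: 10, label: '< 10'}
      {index: 1, startOn: 10, endBelow: 20, label: '10-20'}
      {index: 2, startOn: 20, endBelow: null, label: '>= 20'}
    ]

    console.log(histogram.bucket(5, buckets)?.label)   # expect '< 10'
    console.log(histogram.bucket(15, buckets)?.label)  # expect '10-20'
    console.log(histogram.bucket(25, buckets)?.label)  # expect '>= 20'
    console.log(histogram.bucket(undefined, buckets))  # expect null - undefined values can't be bucketed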
6 changes: 5 additions & 1 deletion play/play3.coffee
@@ -1,12 +1,16 @@
fs = require('fs')
path = require('path')
{utils, csvString_To_CSVStyleArray, csvStyleArray_To_ArrayOfMaps, functions, histogram} = require('../')

filename = path.join(__dirname, 'bigDump-2013-03-07.csv')
filename = path.join(__dirname, 'dump-2013-05-28.csv')
bigDumpCSVString = fs.readFileSync(filename, 'utf8')

console.log('file read')

csvArray = csvString_To_CSVStyleArray(bigDumpCSVString)
rawData = csvStyleArray_To_ArrayOfMaps(csvArray)

console.log('now in array')

console.time('bucketsPercentile')
buckets = histogram.bucketsPercentile(rawData, 'FullTimeEquivalent')
console.timeEnd('bucketsPercentile')
22 changes: 12 additions & 10 deletions src/dataTransform.coffee
@@ -159,18 +159,20 @@ csvString_To_CSVStyleArray = (s, asterixForUndefined = true) ->
headerLength = rows[0].split(',').length

out = []
for row in rows
for row, index in rows
newRow = []
rawRowArray = row.split(',')
if rawRowArray.length isnt headerLength
throw new Error('Row length does not match header length.')
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
if rawRowArray.length is headerLength
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
else
# throw new Error('Row length does not match header length.')
console.log("Warning: Skipping row because length does not match header length in row #{index}: #{row}")

return out

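A small sketch of the new forgiving parse, assuming a newline-separated CSV string; the data is made up.

    {csvString_To_CSVStyleArray} = require('../')

    csv = '"id","score"\n1,10\n2,20,999\n3,30'

    # The second data row has 3 cells against a 2-cell header, so it is now skipped
    # with a console warning instead of aborting the whole parse with an Error.
    console.log(csvString_To_CSVStyleArray(csv))  # expect [['id', 'score'], [1, 10], [3, 30]]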