Bucket merging and percentileHigherIsBetter and percentileLowerIsBetter upgrades
lmaccherone committed Jun 3, 2013
1 parent dae4927 commit dc641ad
Showing 9 changed files with 265 additions and 113 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ test/*.map
test/*.js
play/*.map
play/*.js
play/*.csv
lumenize.js
lumenize.map
deploy
1 change: 1 addition & 0 deletions docs/Lumenize-docs/index.html
@@ -372,6 +372,7 @@ <h1>Lumenize</h1>
<h3>Overview, Transform, and Histogram</h3>
<ul class='links'>
<li><a href="#!/api/Lumenize" rel="Lumenize" class="docClass">Lumenize</a></li>
<li><a href="#!/api/Lumenize.histogram" rel="Lumenize.histogram" class="docClass">Lumenize.histogram</a></li>
</ul>
</div>
<div class='middle-column'>
2 changes: 1 addition & 1 deletion docs/Lumenize-docs/output/Lumenize.histogram.js

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions docs/Lumenize-docs/source/dataTransform.coffee.html
@@ -186,18 +186,20 @@
headerLength = rows[0].split(',').length

out = []
for row in rows
for row, index in rows
newRow = []
rawRowArray = row.split(',')
if rawRowArray.length isnt headerLength
throw new Error('Row length does not match header length.')
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
if rawRowArray.length is headerLength
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
else
# throw new Error('Row length does not match header length.')
console.log("Warning: Skipping row because length does not match header length in row #{index}: #{row}")

return out

165 changes: 116 additions & 49 deletions docs/Lumenize-docs/source/histogram.coffee.html
@@ -55,11 +55,7 @@
* h = histogram.histogram(grades, 'average')
*
* console.log(h)
* # [ { index: 0,
* # startOn: -Infinity,
* # endBelow: Infinity,
* # label: 'all',
* # count: 2 } ]
* # [ { index: 0, startOn: null, endBelow: null, label: 'all', count: 2 } ]
*
* Or, we can just pass in a list of values
*
@@ -116,12 +116,12 @@
lowerBase = firstStartOn
else
lowerBase = roundDownToSignificance(min, significance)
firstStartOn = -Infinity
firstStartOn = null
if lastEndBelow?
upperBase = lastEndBelow
else
upperBase = roundUpToSignificance(max, significance)
lastEndBelow = Infinity
lastEndBelow = null

return {values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase}

@@ -144,26 +140,18 @@

# first bucket
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEdge}
if firstStartOn is -Infinity
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

# all the buckets in the middle
for i in [1..bucketCount - 2]
edge = lastEdge + bucketSize
buckets.push({index: i, startOn: lastEdge, endBelow: edge, label: "#{lastEdge}-#{edge}"})
buckets.push({index: i, startOn: lastEdge, endBelow: edge})
lastEdge = edge

# last bucket
if lastEdge >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance.")
if lastEdge? and lastEndBelow? and lastEdge >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance. lastEdge: #{lastEdge} lastEndBelow: #{lastEndBelow}")
bucket = {index:bucketCount - 1, startOn: lastEdge, endBelow: lastEndBelow}
if lastEndBelow is Infinity
bucket.label = ">= #{bucket.startOn}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

return buckets
@@ -172,36 +160,28 @@
{values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase} = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance)

if bucketCount < 3
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEndBelow, label: 'all'}
bucket = {index: 0, startOn: firstStartOn, endBelow: lastEndBelow}
buckets.push(bucket)
return buckets

bucketSize = 100 / bucketCount
buckets = [] # each row is {index, startOn, endBelow, label} meaning bucket startOn <= x < endBelow
buckets = [] # each row is {index, startOn, endBelow} meaning bucket startOn <= x < endBelow

# first bucket
currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize)(values), significance)
bucket = {index: 0, startOn: firstStartOn, endBelow: currentBoundary}
if firstStartOn is -Infinity
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

# all the buckets in the middle
for i in [1..bucketCount - 2]
lastBoundary = currentBoundary
currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize * (i + 1))(values), significance)
buckets.push({index: i, startOn: lastBoundary, endBelow: currentBoundary, label: "#{lastBoundary}-#{currentBoundary}"})
buckets.push({index: i, startOn: lastBoundary, endBelow: currentBoundary})

# last bucket
if lastBoundary >= lastEndBelow
if lastBoundary? and lastEndBelow? and lastBoundary >= lastEndBelow
throw new Error("Somehow, the last bucket didn't work out. Try a different bucketCount.")
bucket = {index:bucketCount - 1, startOn: currentBoundary, endBelow: lastEndBelow}
if lastEndBelow is Infinity
bucket.label = ">= #{bucket.startOn}"
else
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
buckets.push(bucket)

return buckets
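For orientation, here is a minimal sketch of driving these bucketing strategies through the public histogram.buckets API; the data points, significance, and bounds below are invented for illustration, and the exact boundaries will differ for real data.

    {histogram} = require('../')

    # Ten made-up data points; the field name 'x' is arbitrary.
    rows = ({x: v} for v in [3, 7, 12, 18, 22, 27, 31, 36, 44, 49])

    # Constant-width buckets with explicit bounds keep every bucket closed, so each
    # gets a "startOn-endBelow" label. Leaving firstStartOn/lastEndBelow null (the
    # default) would make the first and last buckets open-ended instead.
    buckets = histogram.buckets(rows, 'x', histogram.bucketsConstantWidth, 1, 0, 50, 5)
    console.log(buckets)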
Expand All @@ -212,8 +192,9 @@
* @method bucketsPercentile
* @member Lumenize.histogram
*
* This is a short cut to creating a set of exactly 100 buckets to be used for bucketing (or scoring) values in percentiles.
* The index of the bucket is the percentile. Note: You can't score in the 100th percentile because you can't beat your own score.
* This is a short cut to creating a set of buckets for "scoring" in percentiles (think standardized testing).
*
* Note: You can't score in the 100th percentile because you can't beat your own score.
* If you have a higher score than anybody else, you didn't beat your own score. So, you aren't better than 100%. If there are
* less than 100 total scores then you technically can't even be in the 99th percentile. This function is hard-coded
* to only create 100 buckets. However, if you wanted to calculate fractional percentiles. Say you want to know who
@@ -254,7 +235,7 @@
* Let's create a little helper function to convert the percentiles to grades. It includes a call to `histogram.bucket`.
*
* getGrade = (average, buckets) ->
* percentile = histogram.bucket(average, buckets).index
* percentile = histogram.bucket(average, buckets).percentileHigherIsBetter
* if percentile >= 90
* return 'A'
* else if percentile >= 60
@@ -287,21 +268,27 @@
*
* @return {Object[]}
*
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label}
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, percentileHigherIsBetter, percentileLowerIsBetter}
*
* To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)`
*
* The buckets array that is returned will have these properties:
*
* * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
* * There will be exactly 100 buckets.
* * The index of the first one will be 0.
* * The index of the last one will be 99.
* * The first startOn will be -Infinity
* * The last endBelow will be Infinity.
* To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)` and
* then read the percentileHigherIsBetter or percentileLowerIsBetter of the bucket that is returned.
*/
/* <CoffeeScript>
return histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100)
buckets = histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100)
percentile = 0
for b in buckets
if b.matchingRangeIndexEnd?
b.percentileHigherIsBetter = b.matchingRangeIndexStart
b.percentileLowerIsBetter = 99 - b.matchingRangeIndexEnd
percentile = b.matchingRangeIndexEnd
delete b.matchingRangeIndexEnd
delete b.matchingRangeIndexStart
else
b.percentileHigherIsBetter = percentile
b.percentileLowerIsBetter = 99 - percentile
percentile++

return buckets

histogram.buckets = (rows, valueField, type = histogram.bucketsConstantWidth, significance, firstStartOn, lastEndBelow, bucketCount) ->
</CoffeeScript> */
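To make the new percentileHigherIsBetter and percentileLowerIsBetter fields concrete, here is a small usage sketch; the scores are invented and the exact bucket boundaries depend on the data.

    {histogram} = require('../')

    # Made-up student averages.
    grades = ({average: s} for s in [37, 52, 55, 61, 68, 72, 72, 78, 83, 91])

    buckets = histogram.bucketsPercentile(grades, 'average')

    # Look up the bucket for one score and read both percentile interpretations.
    b = histogram.bucket(72, buckets)
    console.log(b.percentileHigherIsBetter)  # percentile when a higher raw score is better
    console.log(b.percentileLowerIsBetter)   # complementary view when a lower raw score is better

Both fields stay in the 0-99 range because, as the doc comment above explains, you can't score in the 100th percentile.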
@@ -332,11 +319,61 @@
* The buckets array that is returned will have these properties:
*
* * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
* * If firstStartOn is not provided, it will be -Infinity
* * If lastEndBelow is not provided, it will be Infinity.
* * Duplicate buckets are merged. When they are merged two fields are added to the resulting merged bucket:
* {matchingRangeIndexStart, matchingRangeIndexEnd} indicating the range that this bucket replaces.
* * If firstStartOn is not provided, it will be null indicating -Infinity
* * If lastEndBelow is not provided, it will be null indicating Infinity.
*/
/* <CoffeeScript>
return type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount)
tempBuckets = type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount)

# return tempBuckets

if tempBuckets.length < 2
buckets = tempBuckets
else # merge duplicate buckets
buckets = []
startOfMatching = tempBuckets[0]
gotToEnd = false
i = 1
while i < tempBuckets.length
currentBucket = tempBuckets[i]
if startOfMatching.startOn == currentBucket.startOn
i++
currentBucket = tempBuckets[i]
while currentBucket? and startOfMatching.startOn == currentBucket.startOn and startOfMatching.endBelow == currentBucket.endBelow
i++
currentBucket = tempBuckets[i]
if i >= tempBuckets.length - 1
currentBucket = tempBuckets[tempBuckets.length - 1]
gotToEnd = true
startOfMatching.matchingRangeIndexStart = startOfMatching.index
startOfMatching.matchingRangeIndexEnd = currentBucket.index
startOfMatching.endBelow = currentBucket.endBelow
buckets.push(startOfMatching)
i++
currentBucket = tempBuckets[i]
else
buckets.push(startOfMatching)
startOfMatching = currentBucket
i++
unless gotToEnd
buckets.push(currentBucket)

# reindex and add labels
for bucket, index in buckets
bucket.index = index
# delete bucket.index
if bucket.startOn? and bucket.endBelow?
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
else if bucket.startOn?
bucket.label = ">= #{bucket.startOn}"
else if bucket.endBelow?
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "all"

return buckets

histogram.bucket = (value, buckets) ->
</CoffeeScript> */
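The merge pass above collapses buckets that come back with identical boundaries (common for constant-depth bucketing over heavily repeated values), records the collapsed range in matchingRangeIndexStart/matchingRangeIndexEnd, reindexes, and labels open ends with "< n" / ">= n". A rough sketch with invented data:

    {histogram} = require('../')

    # Heavily repeated values, so several constant-depth boundaries coincide.
    rows = ({x: v} for v in [1, 2, 2, 2, 2, 2, 2, 3, 9, 12])

    buckets = histogram.buckets(rows, 'x', histogram.bucketsConstantDepth, null, null, null, 10)

    # Expect fewer than 10 buckets back: duplicates are merged and reindexed, and the
    # first/last buckets get "< n" / ">= n" labels because firstStartOn and
    # lastEndBelow default to null.
    console.log(buckets.length)
    console.log(buckets)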
@@ -360,9 +397,39 @@
* because you can't beat your own score. This is simlar logic.
*/
/* <CoffeeScript>
for b in buckets
unless value?
return null

# middle buckets
if buckets.length >= 3
for i in [1..buckets.length - 2]
b = buckets[i]
if b.startOn <= value < b.endBelow
return b

# convoluted logic so it works for buckets of length 1, 2, and 3+
b = buckets[0]
if b.startOn? and b.endBelow?
if b.startOn <= value < b.endBelow
return b
else if b.startOn?
if b.startOn <= value
return b
else if b.endBelow?
if value < b.endBelow
return b
else if !b.startOn? and !b.endBelow?
return b

# the only situation where you get to this point is when startOn is non-null and it might be the last bucket
b = buckets[buckets.length - 1]
if b.endBelow?
if b.startOn <= value < b.endBelow
return b
else
if b.startOn <= value
return b

return null

histogram.histogramFromBuckets = (rows, valueField, buckets) ->
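For reference, a hand-rolled sketch of the lookup behaviour the rewritten histogram.bucket targets; the bucket objects below are built by hand rather than produced by the library.

    {histogram} = require('../')

    # startOn: null means "from -Infinity"; endBelow: null means "up to Infinity".
    buckets = [
      {index: 0, startOn: null, endBelow: 10, label: '< 10'}
      {index: 1, startOn: 10, endBelow: 20, label: '10-20'}
      {index: 2, startOn: 20, endBelow: null, label: '>= 20'}
    ]

    console.log(histogram.bucket(5, buckets)?.label)   # expect '< 10'
    console.log(histogram.bucket(15, buckets)?.label)  # expect '10-20'
    console.log(histogram.bucket(25, buckets)?.label)  # expect '>= 20'
    console.log(histogram.bucket(undefined, buckets))  # expect null - undefined values can't be bucketed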
6 changes: 5 additions & 1 deletion play/play3.coffee
@@ -1,12 +1,16 @@
fs = require('fs')
path = require('path')
{utils, csvString_To_CSVStyleArray, csvStyleArray_To_ArrayOfMaps, functions, histogram} = require('../')

filename = path.join(__dirname, 'bigDump-2013-03-07.csv')
filename = path.join(__dirname, 'dump-2013-05-28.csv')
bigDumpCSVString = fs.readFileSync(filename, 'utf8')

console.log('file read')

csvArray = csvString_To_CSVStyleArray(bigDumpCSVString)
rawData = csvStyleArray_To_ArrayOfMaps(csvArray)

console.log('now in array')

console.time('bucketsPercentile')
buckets = histogram.bucketsPercentile(rawData, 'FullTimeEquivalent')
console.timeEnd('bucketsPercentile')
22 changes: 12 additions & 10 deletions src/dataTransform.coffee
@@ -159,18 +159,20 @@ csvString_To_CSVStyleArray = (s, asterixForUndefined = true) ->
headerLength = rows[0].split(',').length

out = []
for row in rows
for row, index in rows
newRow = []
rawRowArray = row.split(',')
if rawRowArray.length isnt headerLength
throw new Error('Row length does not match header length.')
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
if rawRowArray.length is headerLength
for c in rawRowArray
if asterixForUndefined and c is '*'
cValue = undefined
else
cValue = JSON.parse(c)
newRow.push(cValue)
out.push(newRow)
else
# throw new Error('Row length does not match header length.')
console.log("Warning: Skipping row because length does not match header length in row #{index}: #{row}")

return out

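A small sketch of the new forgiving parse, assuming a newline-separated CSV string; the data is made up.

    {csvString_To_CSVStyleArray} = require('../')

    csv = '"id","score"\n1,10\n2,20,999\n3,30'

    # The second data row has 3 cells against a 2-cell header, so it is now skipped
    # with a console warning instead of aborting the whole parse with an Error.
    console.log(csvString_To_CSVStyleArray(csv))  # expect [['id', 'score'], [1, 10], [3, 30]]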