Skip to content

Commit

Permalink
Update project
Browse files Browse the repository at this point in the history
  • Loading branch information
wooorm committed Jan 24, 2020
1 parent 961deb2 commit 8684557
Show file tree
Hide file tree
Showing 14 changed files with 195 additions and 146 deletions.
9 changes: 9 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
node_modules
.DS_Store
*.log
node_modules/
yarn.lock
archive.zip
1 change: 1 addition & 0 deletions .npmrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package-lock=false
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
language: node_js
node_js:
- lts/dubnium
- node
68 changes: 0 additions & 68 deletions README.md

This file was deleted.

14 changes: 9 additions & 5 deletions build.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ var dsv = require('d3-dsv')
var bail = require('bail')

// See: http://crr.ugent.be/programs-data/subtitle-frequencies
var endpoint = 'https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus2.zip/at_download/file'
var endpoint =
'https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus2.zip/at_download/file'

// Name in archive.
var name = 'SUBTLEXus74286wordstextversion.txt'

https
.request(endpoint, onrequest)
.end()
var found = false

https.request(endpoint, onrequest).end()

function onrequest(res) {
res
Expand Down Expand Up @@ -62,7 +63,10 @@ function onend() {
}

// Handle the fully buffered contents of the archive entry.
//
// `buf` is stringified and parsed as tab-separated values; each row is passed
// through `map` and the result ordered with `sort` (both defined elsewhere in
// this file), then the list is written to `index.json`.
//
// NOTE(review): the two `var data` statements below compute the same value;
// this looks like the before/after pair of a diff view rather than intentional
// duplication. Confirm that only one is meant to remain.
function onconcat(buf) {
var data = dsv.tsvParse(String(buf)).map(map).sort(sort)
var data = dsv
.tsvParse(String(buf))
.map(map)
.sort(sort)

// Pretty-print with two-space indent and end the file with a newline.
// `bail` is passed as the write callback (presumably throws on error; verify
// against the `bail` package).
fs.writeFile('index.json', JSON.stringify(data, null, 2) + '\n', bail)
}
Expand Down
Binary file removed data/BrysbaertNew2009.pdf
Binary file not shown.
6 changes: 0 additions & 6 deletions data/readme.txt

This file was deleted.

26 changes: 0 additions & 26 deletions data/sample.txt

This file was deleted.

22 changes: 0 additions & 22 deletions example.js

This file was deleted.

15 changes: 15 additions & 0 deletions license
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
ISC License

Copyright (c) 2015 Zeke Sikelianos <[email protected]>

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
64 changes: 46 additions & 18 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,60 @@
{
"name": "subtlex-word-frequencies",
"version": "1.0.0",
"description": "An array of over 200,000 words sorted by frequency of use in spoken English.",
"main": "index.json",
"scripts": {
"build": "node build",
"test": "standard --format"
},
"repository": {
"type": "git",
"url": "git+https://github.com/zeke/subtlex-word-frequencies.git"
},
"description": "List of 74,286 words sorted by frequency of use in spoken English",
"license": "ISC",
"keywords": [
"subtlex",
"subtitle",
"words",
"language"
"language",
"frequency"
],
"author": "zeke",
"license": "ISC",
"bugs": {
"url": "https://github.com/zeke/subtlex-word-frequencies/issues"
},
"homepage": "https://github.com/zeke/subtlex-word-frequencies#readme",
"repository": "words/subtlex-word-frequencies",
"bugs": "https://github.com/words/subtlex-word-frequencies/issues",
"author": "Zeke Sikelianos <[email protected]> (http://zeke.sikelianos.com)",
"contributors": [
"Zeke Sikelianos <[email protected]> (http://zeke.sikelianos.com)",
"Titus Wormer <[email protected]> (https://wooorm.com)"
],
"main": "index.json",
"files": [
"index.json"
],
"dependencies": {},
"devDependencies": {
"bail": "^1.0.0",
"concat-stream": "^2.0.0",
"d3-dsv": "^1.0.0",
"standard": "^4.5.4",
"prettier": "^1.0.0",
"remark-cli": "^7.0.0",
"remark-preset-wooorm": "^6.0.0",
"tape": "^4.0.0",
"unified": "^8.0.0",
"xo": "^0.25.0",
"yauzl": "^2.10.0"
},
"scripts": {
"generate": "node build",
"format": "remark . -qfo && prettier --write \"**/*.js\" && xo --fix",
"test-api": "node test",
"test": "npm run format && npm run test-api"
},
"prettier": {
"tabWidth": 2,
"useTabs": false,
"singleQuote": true,
"bracketSpacing": false,
"semi": false,
"trailingComma": "none"
},
"xo": {
"prettier": true,
"esnext": false
},
"remarkConfig": {
"plugins": [
"preset-wooorm"
]
}
}
91 changes: 91 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# `subtlex-word-frequencies`

[![Build][build-badge]][build]
[![Downloads][downloads-badge]][downloads]
[![Size][size-badge]][size]

List of 74,286 words sorted by frequency of use in spoken English.

The word counts are derived from [SUBTLEXus][], a corpus of American English
subtitles of movies.

## Install

[npm][]:

```sh
npm install subtlex-word-frequencies
```

## Use

```js
var words = require('subtlex-word-frequencies')

console.log(words.length)

console.log(words.slice(0, 3))

console.log(words.filter(d => d.word.match(/chick/)).slice(0, 5))
```

Yields:

```js
74286
[
{word: 'you', count: 2134713},
{word: 'I', count: 2038529},
{word: 'the', count: 1501908}
]
[
{word: 'chicken', count: 3148},
{word: 'chick', count: 1334},
{word: 'chicks', count: 742},
{word: 'chickens', count: 520},
{word: 'chickenshit', count: 85}
]
```

## API

### `subtlexWordFrequencies`

`Array.<Entry>` — List of all entries in SUBTLEXus.
Each entry has the following properties:

* `word` (`string`) — Unique word
(example: `git`)
* `count` (`number`) — Number of times the word appears in the corpus
(example: `101`)

`word` starts with a capital when the word more often starts with an uppercase
letter than with a lowercase letter (example: `I`).

The entire original corpus consists of 51 million words.

## License

[ISC][license] © [Zeke Sikelianos][author]

<!-- Definition -->

[build-badge]: https://img.shields.io/travis/words/subtlex-word-frequencies.svg

[build]: https://travis-ci.org/words/subtlex-word-frequencies

[downloads-badge]: https://img.shields.io/npm/dm/subtlex-word-frequencies.svg

[downloads]: https://www.npmjs.com/package/subtlex-word-frequencies

[size-badge]: https://img.shields.io/bundlephobia/minzip/subtlex-word-frequencies.svg

[size]: https://bundlephobia.com/result?p=subtlex-word-frequencies

[npm]: https://docs.npmjs.com/cli/install

[license]: license

[author]: http://zeke.sikelianos.com

[subtlexus]: https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus
16 changes: 16 additions & 0 deletions test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
'use strict'

var test = require('tape')
var subtlex = require('.')

// Smoke test for the generated list: the export must be an array, and the
// entry for `right` must carry the expected count.
test('subtlex', function(t) {
  var expected = {word: 'right', count: 204428}
  var matches

  // Two assertions are planned: the array check plus exactly one `right`
  // entry. Any other number of matches fails the plan.
  t.plan(2)

  t.ok(Array.isArray(subtlex), 'should be an `array`')

  matches = subtlex.filter(isRight)
  matches.forEach(check)

  // Select the entries whose word is `right`.
  function isRight(entry) {
    return entry.word === 'right'
  }

  // Compare a selected entry against the expected record.
  function check(entry) {
    t.deepEqual(entry, expected, 'should work')
  }
})

0 comments on commit 8684557

Please sign in to comment.