From c002bda601ba02a9c77b7509ca6bab370ac1783b Mon Sep 17 00:00:00 2001 From: Tom Noogen Date: Mon, 8 Jul 2019 10:28:41 -0500 Subject: [PATCH] reformatting and update documentation --- .gitignore | 35 +- README.md | 32 + composer.lock | 1542 +++++++++++++++++++++++++++++++++++++++++++++++++ src/Bayes.php | 294 ++++++---- 4 files changed, 1772 insertions(+), 131 deletions(-) create mode 100644 composer.lock diff --git a/.gitignore b/.gitignore index 30ef0ca..b9baa20 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,32 @@ +/node_modules +/public/hot +/public/storage +/storage/*.key +/storage/debugbar +/vendor +/.idea +/.vagrant +Homestead.json +Homestead.yaml +npm-debug.log +yarn-error.log + .DS_Store -.idea/ -vendor/ -build/ -composer.lock +coverage/ +bootstrap/cache/*.php +storage/build/ +public/js/parts/* +.env + +.docker/data/ +!.docker/data/*/*.gitkeep +!.docker/data/*.gitkeep +.docker/logs/ +!.docker/logs/*.gitkeep +storage/installed +docker-compose.yml +._*.png +dist.zip +.php*.cache -/.phpintel/ +playbook/recipe/downloaded/ diff --git a/README.md b/README.md index 725d60a..ccd22d1 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ Teach your classifier what `category` the `text` belongs to. The more you teach Returns the `category` it thinks `text` belongs to. Its judgement is based on what you have taught it with **.learn()**. +### `$classifier->propabilities(text)` + +Extract the probabilities for each known category. + ### `$classifier->toJson()` Returns the JSON representation of a classifier. @@ -73,5 +77,33 @@ Returns the JSON representation of a classifier. Returns a classifier instance from the JSON representation. Use this with the JSON representation obtained from `$classifier->toJson()` +## Stopwords + +You can pass in your own tokenizer function in the constructor. Example: + +``` +// array containing stopwords +$stopwords = array("der", "die", "das", "the"); + +// escape the stopword array and implode with pipe +$s = '~^\W*('.implode("|", array_map("preg_quote", $stopwords)).')\W+\b|\b\W+(?1)\W*$~i'; + +$options['tokenizer'] = function($text) use ($s) { + // convert everything to lowercase + $text = mb_strtolower($text); + + // remove stop words + $text = preg_replace($s, '', $text); + + // split the words + preg_match_all('/[[:alpha:]]+/u', $text, $matches); + + // first match list of words + return $matches[0]; + }; + +$classifier = new \niiknow\Bayes($options); +``` + ## MIT diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..ce84441 --- /dev/null +++ b/composer.lock @@ -0,0 +1,1542 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "b649912c285ba33ba94e6fcdc8e740eb", + "packages": [], + "packages-dev": [ + { + "name": "doctrine/instantiator", + "version": "1.2.0", + "source": { + "type": "git", + "url": "https://github.com/doctrine/instantiator.git", + "reference": "a2c590166b2133a4633738648b6b064edae0814a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/doctrine/instantiator/zipball/a2c590166b2133a4633738648b6b064edae0814a", + "reference": "a2c590166b2133a4633738648b6b064edae0814a", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "require-dev": { + "doctrine/coding-standard": "^6.0", + "ext-pdo": "*", + "ext-phar": "*", + "phpbench/phpbench": "^0.13", + "phpstan/phpstan-phpunit": "^0.11", + "phpstan/phpstan-shim": "^0.11", + "phpunit/phpunit": "^7.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.2.x-dev" + } + }, + "autoload": { + "psr-4": { + "Doctrine\\Instantiator\\": "src/Doctrine/Instantiator/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Marco Pivetta", + "email": "ocramius@gmail.com", + "homepage": "http://ocramius.github.com/" + } + ], + "description": "A small, lightweight utility to instantiate objects in PHP without invoking their constructors", + "homepage": "https://www.doctrine-project.org/projects/instantiator.html", + "keywords": [ + "constructor", + "instantiate" + ], + "time": "2019-03-17T17:37:11+00:00" + }, + { + "name": "myclabs/deep-copy", + "version": "1.9.1", + "source": { + "type": "git", + "url": "https://github.com/myclabs/DeepCopy.git", + "reference": "e6828efaba2c9b79f4499dae1d66ef8bfa7b2b72" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/e6828efaba2c9b79f4499dae1d66ef8bfa7b2b72", + "reference": "e6828efaba2c9b79f4499dae1d66ef8bfa7b2b72", + "shasum": "" + }, + "require": { + "php": "^7.1" + }, + "replace": { + "myclabs/deep-copy": "self.version" + }, + "require-dev": { + "doctrine/collections": "^1.0", + "doctrine/common": "^2.6", + "phpunit/phpunit": "^7.1" + }, + "type": "library", + "autoload": { + "psr-4": { + "DeepCopy\\": "src/DeepCopy/" + }, + "files": [ + "src/DeepCopy/deep_copy.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "Create deep copies (clones) of your objects", + "keywords": [ + "clone", + "copy", + "duplicate", + "object", + "object graph" + ], + "time": "2019-04-07T13:18:21+00:00" + }, + { + "name": "phar-io/manifest", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/phar-io/manifest.git", + "reference": "2df402786ab5368a0169091f61a7c1e0eb6852d0" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phar-io/manifest/zipball/2df402786ab5368a0169091f61a7c1e0eb6852d0", + "reference": "2df402786ab5368a0169091f61a7c1e0eb6852d0", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-phar": "*", + "phar-io/version": "^1.0.1", + "php": "^5.6 || ^7.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + }, + { + "name": "Sebastian Heuer", + "email": "sebastian@phpeople.de", + "role": "Developer" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "Developer" + } + ], + "description": "Component for reading phar.io manifest information from a PHP Archive (PHAR)", + "time": "2017-03-05T18:14:27+00:00" + }, + { + "name": "phar-io/version", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/phar-io/version.git", + "reference": "a70c0ced4be299a63d32fa96d9281d03e94041df" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phar-io/version/zipball/a70c0ced4be299a63d32fa96d9281d03e94041df", + "reference": "a70c0ced4be299a63d32fa96d9281d03e94041df", + "shasum": "" + }, + "require": { + "php": "^5.6 || ^7.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + }, + { + "name": "Sebastian Heuer", + "email": "sebastian@phpeople.de", + "role": "Developer" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "Developer" + } + ], + "description": "Library for handling version information and constraints", + "time": "2017-03-05T17:38:23+00:00" + }, + { + "name": "phpdocumentor/reflection-common", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/ReflectionCommon.git", + "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionCommon/zipball/21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", + "reference": "21bdeb5f65d7ebf9f43b1b25d404f87deab5bfb6", + "shasum": "" + }, + "require": { + "php": ">=5.5" + }, + "require-dev": { + "phpunit/phpunit": "^4.6" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Jaap van Otterdijk", + "email": "opensource@ijaap.nl" + } + ], + "description": "Common reflection classes used by phpdocumentor to reflect the code structure", + "homepage": "http://www.phpdoc.org", + "keywords": [ + "FQSEN", + "phpDocumentor", + "phpdoc", + "reflection", + "static analysis" + ], + "time": "2017-09-11T18:02:19+00:00" + }, + { + "name": "phpdocumentor/reflection-docblock", + "version": "4.3.1", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", + "reference": "bdd9f737ebc2a01c06ea7ff4308ec6697db9b53c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/bdd9f737ebc2a01c06ea7ff4308ec6697db9b53c", + "reference": "bdd9f737ebc2a01c06ea7ff4308ec6697db9b53c", + "shasum": "" + }, + "require": { + "php": "^7.0", + "phpdocumentor/reflection-common": "^1.0.0", + "phpdocumentor/type-resolver": "^0.4.0", + "webmozart/assert": "^1.0" + }, + "require-dev": { + "doctrine/instantiator": "~1.0.5", + "mockery/mockery": "^1.0", + "phpunit/phpunit": "^6.4" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "4.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src/" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Mike van Riel", + "email": "me@mikevanriel.com" + } + ], + "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", + "time": "2019-04-30T17:48:53+00:00" + }, + { + "name": "phpdocumentor/type-resolver", + "version": "0.4.0", + "source": { + "type": "git", + "url": "https://github.com/phpDocumentor/TypeResolver.git", + "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/9c977708995954784726e25d0cd1dddf4e65b0f7", + "reference": "9c977708995954784726e25d0cd1dddf4e65b0f7", + "shasum": "" + }, + "require": { + "php": "^5.5 || ^7.0", + "phpdocumentor/reflection-common": "^1.0" + }, + "require-dev": { + "mockery/mockery": "^0.9.4", + "phpunit/phpunit": "^5.2||^4.8.24" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "phpDocumentor\\Reflection\\": [ + "src/" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Mike van Riel", + "email": "me@mikevanriel.com" + } + ], + "time": "2017-07-14T14:27:02+00:00" + }, + { + "name": "phpspec/prophecy", + "version": "1.8.1", + "source": { + "type": "git", + "url": "https://github.com/phpspec/prophecy.git", + "reference": "1927e75f4ed19131ec9bcc3b002e07fb1173ee76" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpspec/prophecy/zipball/1927e75f4ed19131ec9bcc3b002e07fb1173ee76", + "reference": "1927e75f4ed19131ec9bcc3b002e07fb1173ee76", + "shasum": "" + }, + "require": { + "doctrine/instantiator": "^1.0.2", + "php": "^5.3|^7.0", + "phpdocumentor/reflection-docblock": "^2.0|^3.0.2|^4.0", + "sebastian/comparator": "^1.1|^2.0|^3.0", + "sebastian/recursion-context": "^1.0|^2.0|^3.0" + }, + "require-dev": { + "phpspec/phpspec": "^2.5|^3.2", + "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5 || ^7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.8.x-dev" + } + }, + "autoload": { + "psr-4": { + "Prophecy\\": "src/Prophecy" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Konstantin Kudryashov", + "email": "ever.zet@gmail.com", + "homepage": "http://everzet.com" + }, + { + "name": "Marcello Duarte", + "email": "marcello.duarte@gmail.com" + } + ], + "description": "Highly opinionated mocking framework for PHP 5.3+", + "homepage": "https://github.com/phpspec/prophecy", + "keywords": [ + "Double", + "Dummy", + "fake", + "mock", + "spy", + "stub" + ], + "time": "2019-06-13T12:50:23+00:00" + }, + { + "name": "phpunit/php-code-coverage", + "version": "5.3.2", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-code-coverage.git", + "reference": "c89677919c5dd6d3b3852f230a663118762218ac" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/c89677919c5dd6d3b3852f230a663118762218ac", + "reference": "c89677919c5dd6d3b3852f230a663118762218ac", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-xmlwriter": "*", + "php": "^7.0", + "phpunit/php-file-iterator": "^1.4.2", + "phpunit/php-text-template": "^1.2.1", + "phpunit/php-token-stream": "^2.0.1", + "sebastian/code-unit-reverse-lookup": "^1.0.1", + "sebastian/environment": "^3.0", + "sebastian/version": "^2.0.1", + "theseer/tokenizer": "^1.1" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "suggest": { + "ext-xdebug": "^2.5.5" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "5.3.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", + "homepage": "https://github.com/sebastianbergmann/php-code-coverage", + "keywords": [ + "coverage", + "testing", + "xunit" + ], + "time": "2018-04-06T15:36:58+00:00" + }, + { + "name": "phpunit/php-file-iterator", + "version": "1.4.5", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-file-iterator.git", + "reference": "730b01bc3e867237eaac355e06a36b85dd93a8b4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/730b01bc3e867237eaac355e06a36b85dd93a8b4", + "reference": "730b01bc3e867237eaac355e06a36b85dd93a8b4", + "shasum": "" + }, + "require": { + "php": ">=5.3.3" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.4.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sb@sebastian-bergmann.de", + "role": "lead" + } + ], + "description": "FilterIterator implementation that filters files based on a list of suffixes.", + "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", + "keywords": [ + "filesystem", + "iterator" + ], + "time": "2017-11-27T13:52:08+00:00" + }, + { + "name": "phpunit/php-text-template", + "version": "1.2.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-text-template.git", + "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/31f8b717e51d9a2afca6c9f046f5d69fc27c8686", + "reference": "31f8b717e51d9a2afca6c9f046f5d69fc27c8686", + "shasum": "" + }, + "require": { + "php": ">=5.3.3" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Simple template engine.", + "homepage": "https://github.com/sebastianbergmann/php-text-template/", + "keywords": [ + "template" + ], + "time": "2015-06-21T13:50:34+00:00" + }, + { + "name": "phpunit/php-timer", + "version": "1.0.9", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-timer.git", + "reference": "3dcf38ca72b158baf0bc245e9184d3fdffa9c46f" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/3dcf38ca72b158baf0bc245e9184d3fdffa9c46f", + "reference": "3dcf38ca72b158baf0bc245e9184d3fdffa9c46f", + "shasum": "" + }, + "require": { + "php": "^5.3.3 || ^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sb@sebastian-bergmann.de", + "role": "lead" + } + ], + "description": "Utility class for timing", + "homepage": "https://github.com/sebastianbergmann/php-timer/", + "keywords": [ + "timer" + ], + "time": "2017-02-26T11:10:40+00:00" + }, + { + "name": "phpunit/php-token-stream", + "version": "2.0.2", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/php-token-stream.git", + "reference": "791198a2c6254db10131eecfe8c06670700904db" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/791198a2c6254db10131eecfe8c06670700904db", + "reference": "791198a2c6254db10131eecfe8c06670700904db", + "shasum": "" + }, + "require": { + "ext-tokenizer": "*", + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.2.4" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Wrapper around PHP's tokenizer extension.", + "homepage": "https://github.com/sebastianbergmann/php-token-stream/", + "keywords": [ + "tokenizer" + ], + "time": "2017-11-27T05:48:46+00:00" + }, + { + "name": "phpunit/phpunit", + "version": "6.3.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/phpunit.git", + "reference": "c0ff817b36a827e64bf5f57bc72278150cf30a77" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/c0ff817b36a827e64bf5f57bc72278150cf30a77", + "reference": "c0ff817b36a827e64bf5f57bc72278150cf30a77", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-json": "*", + "ext-libxml": "*", + "ext-mbstring": "*", + "ext-xml": "*", + "myclabs/deep-copy": "^1.6.1", + "phar-io/manifest": "^1.0.1", + "phar-io/version": "^1.0", + "php": "^7.0", + "phpspec/prophecy": "^1.7", + "phpunit/php-code-coverage": "^5.2.2", + "phpunit/php-file-iterator": "^1.4.2", + "phpunit/php-text-template": "^1.2.1", + "phpunit/php-timer": "^1.0.9", + "phpunit/phpunit-mock-objects": "^4.0.3", + "sebastian/comparator": "^2.0.2", + "sebastian/diff": "^2.0", + "sebastian/environment": "^3.1", + "sebastian/exporter": "^3.1", + "sebastian/global-state": "^2.0", + "sebastian/object-enumerator": "^3.0.3", + "sebastian/resource-operations": "^1.0", + "sebastian/version": "^2.0.1" + }, + "conflict": { + "phpdocumentor/reflection-docblock": "3.0.2", + "phpunit/dbunit": "<3.0" + }, + "require-dev": { + "ext-pdo": "*" + }, + "suggest": { + "ext-xdebug": "*", + "phpunit/php-invoker": "^1.1" + }, + "bin": [ + "phpunit" + ], + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "6.3.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "The PHP Unit Testing framework.", + "homepage": "https://phpunit.de/", + "keywords": [ + "phpunit", + "testing", + "xunit" + ], + "time": "2017-09-24T07:25:54+00:00" + }, + { + "name": "phpunit/phpunit-mock-objects", + "version": "4.0.4", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git", + "reference": "2f789b59ab89669015ad984afa350c4ec577ade0" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/2f789b59ab89669015ad984afa350c4ec577ade0", + "reference": "2f789b59ab89669015ad984afa350c4ec577ade0", + "shasum": "" + }, + "require": { + "doctrine/instantiator": "^1.0.5", + "php": "^7.0", + "phpunit/php-text-template": "^1.2.1", + "sebastian/exporter": "^3.0" + }, + "conflict": { + "phpunit/phpunit": "<6.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "suggest": { + "ext-soap": "*" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "4.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sb@sebastian-bergmann.de", + "role": "lead" + } + ], + "description": "Mock Object library for PHPUnit", + "homepage": "https://github.com/sebastianbergmann/phpunit-mock-objects/", + "keywords": [ + "mock", + "xunit" + ], + "abandoned": true, + "time": "2017-08-03T14:08:16+00:00" + }, + { + "name": "sebastian/code-unit-reverse-lookup", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/code-unit-reverse-lookup.git", + "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/code-unit-reverse-lookup/zipball/4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", + "reference": "4419fcdb5eabb9caa61a27c7a1db532a6b55dd18", + "shasum": "" + }, + "require": { + "php": "^5.6 || ^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^5.7 || ^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Looks up which function or method a line of code belongs to", + "homepage": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/", + "time": "2017-03-04T06:30:41+00:00" + }, + { + "name": "sebastian/comparator", + "version": "2.1.3", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/comparator.git", + "reference": "34369daee48eafb2651bea869b4b15d75ccc35f9" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/comparator/zipball/34369daee48eafb2651bea869b4b15d75ccc35f9", + "reference": "34369daee48eafb2651bea869b4b15d75ccc35f9", + "shasum": "" + }, + "require": { + "php": "^7.0", + "sebastian/diff": "^2.0 || ^3.0", + "sebastian/exporter": "^3.1" + }, + "require-dev": { + "phpunit/phpunit": "^6.4" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.1.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Volker Dusch", + "email": "github@wallbash.com" + }, + { + "name": "Bernhard Schussek", + "email": "bschussek@2bepublished.at" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides the functionality to compare PHP values for equality", + "homepage": "https://github.com/sebastianbergmann/comparator", + "keywords": [ + "comparator", + "compare", + "equality" + ], + "time": "2018-02-01T13:46:46+00:00" + }, + { + "name": "sebastian/diff", + "version": "2.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/diff.git", + "reference": "347c1d8b49c5c3ee30c7040ea6fc446790e6bddd" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/347c1d8b49c5c3ee30c7040ea6fc446790e6bddd", + "reference": "347c1d8b49c5c3ee30c7040ea6fc446790e6bddd", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.2" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Kore Nordmann", + "email": "mail@kore-nordmann.de" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Diff implementation", + "homepage": "https://github.com/sebastianbergmann/diff", + "keywords": [ + "diff" + ], + "time": "2017-08-03T08:09:46+00:00" + }, + { + "name": "sebastian/environment", + "version": "3.1.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/environment.git", + "reference": "cd0871b3975fb7fc44d11314fd1ee20925fce4f5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/cd0871b3975fb7fc44d11314fd1ee20925fce4f5", + "reference": "cd0871b3975fb7fc44d11314fd1ee20925fce4f5", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.1.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides functionality to handle HHVM/PHP environments", + "homepage": "http://www.github.com/sebastianbergmann/environment", + "keywords": [ + "Xdebug", + "environment", + "hhvm" + ], + "time": "2017-07-01T08:51:00+00:00" + }, + { + "name": "sebastian/exporter", + "version": "3.1.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/exporter.git", + "reference": "234199f4528de6d12aaa58b612e98f7d36adb937" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/234199f4528de6d12aaa58b612e98f7d36adb937", + "reference": "234199f4528de6d12aaa58b612e98f7d36adb937", + "shasum": "" + }, + "require": { + "php": "^7.0", + "sebastian/recursion-context": "^3.0" + }, + "require-dev": { + "ext-mbstring": "*", + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.1.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Volker Dusch", + "email": "github@wallbash.com" + }, + { + "name": "Bernhard Schussek", + "email": "bschussek@2bepublished.at" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + }, + { + "name": "Adam Harvey", + "email": "aharvey@php.net" + } + ], + "description": "Provides the functionality to export PHP variables for visualization", + "homepage": "http://www.github.com/sebastianbergmann/exporter", + "keywords": [ + "export", + "exporter" + ], + "time": "2017-04-03T13:19:02+00:00" + }, + { + "name": "sebastian/global-state", + "version": "2.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/global-state.git", + "reference": "e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/global-state/zipball/e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4", + "reference": "e8ba02eed7bbbb9e59e43dedd3dddeff4a56b0c4", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "suggest": { + "ext-uopz": "*" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Snapshotting of global state", + "homepage": "http://www.github.com/sebastianbergmann/global-state", + "keywords": [ + "global state" + ], + "time": "2017-04-27T15:39:26+00:00" + }, + { + "name": "sebastian/object-enumerator", + "version": "3.0.3", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/object-enumerator.git", + "reference": "7cfd9e65d11ffb5af41198476395774d4c8a84c5" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/object-enumerator/zipball/7cfd9e65d11ffb5af41198476395774d4c8a84c5", + "reference": "7cfd9e65d11ffb5af41198476395774d4c8a84c5", + "shasum": "" + }, + "require": { + "php": "^7.0", + "sebastian/object-reflector": "^1.1.1", + "sebastian/recursion-context": "^3.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Traverses array structures and object graphs to enumerate all referenced objects", + "homepage": "https://github.com/sebastianbergmann/object-enumerator/", + "time": "2017-08-03T12:35:26+00:00" + }, + { + "name": "sebastian/object-reflector", + "version": "1.1.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/object-reflector.git", + "reference": "773f97c67f28de00d397be301821b06708fca0be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/object-reflector/zipball/773f97c67f28de00d397be301821b06708fca0be", + "reference": "773f97c67f28de00d397be301821b06708fca0be", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.1-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Allows reflection of object attributes, including inherited and non-public ones", + "homepage": "https://github.com/sebastianbergmann/object-reflector/", + "time": "2017-03-29T09:07:27+00:00" + }, + { + "name": "sebastian/recursion-context", + "version": "3.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/recursion-context.git", + "reference": "5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/recursion-context/zipball/5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8", + "reference": "5b0cd723502bac3b006cbf3dbf7a1e3fcefe4fa8", + "shasum": "" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "3.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Jeff Welch", + "email": "whatthejeff@gmail.com" + }, + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + }, + { + "name": "Adam Harvey", + "email": "aharvey@php.net" + } + ], + "description": "Provides functionality to recursively process PHP variables", + "homepage": "http://www.github.com/sebastianbergmann/recursion-context", + "time": "2017-03-03T06:23:57+00:00" + }, + { + "name": "sebastian/resource-operations", + "version": "1.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/resource-operations.git", + "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/resource-operations/zipball/ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", + "reference": "ce990bb21759f94aeafd30209e8cfcdfa8bc3f52", + "shasum": "" + }, + "require": { + "php": ">=5.6.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Provides a list of PHP built-in functions that operate on resources", + "homepage": "https://www.github.com/sebastianbergmann/resource-operations", + "time": "2015-07-28T20:34:47+00:00" + }, + { + "name": "sebastian/version", + "version": "2.0.1", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/version.git", + "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/version/zipball/99732be0ddb3361e16ad77b68ba41efc8e979019", + "reference": "99732be0ddb3361e16ad77b68ba41efc8e979019", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de", + "role": "lead" + } + ], + "description": "Library that helps with managing the version number of Git-hosted PHP projects", + "homepage": "https://github.com/sebastianbergmann/version", + "time": "2016-10-03T07:35:21+00:00" + }, + { + "name": "symfony/polyfill-ctype", + "version": "v1.11.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-ctype.git", + "reference": "82ebae02209c21113908c229e9883c419720738a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/82ebae02209c21113908c229e9883c419720738a", + "reference": "82ebae02209c21113908c229e9883c419720738a", + "shasum": "" + }, + "require": { + "php": ">=5.3.3" + }, + "suggest": { + "ext-ctype": "For best performance" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.11-dev" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Polyfill\\Ctype\\": "" + }, + "files": [ + "bootstrap.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + }, + { + "name": "Gert de Pagter", + "email": "BackEndTea@gmail.com" + } + ], + "description": "Symfony polyfill for ctype functions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "ctype", + "polyfill", + "portable" + ], + "time": "2019-02-06T07:57:58+00:00" + }, + { + "name": "theseer/tokenizer", + "version": "1.1.3", + "source": { + "type": "git", + "url": "https://github.com/theseer/tokenizer.git", + "reference": "11336f6f84e16a720dae9d8e6ed5019efa85a0f9" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/theseer/tokenizer/zipball/11336f6f84e16a720dae9d8e6ed5019efa85a0f9", + "reference": "11336f6f84e16a720dae9d8e6ed5019efa85a0f9", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-tokenizer": "*", + "ext-xmlwriter": "*", + "php": "^7.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Arne Blankerts", + "email": "arne@blankerts.de", + "role": "Developer" + } + ], + "description": "A small library for converting tokenized PHP source code into XML and potentially other formats", + "time": "2019-06-13T22:48:21+00:00" + }, + { + "name": "webmozart/assert", + "version": "1.4.0", + "source": { + "type": "git", + "url": "https://github.com/webmozart/assert.git", + "reference": "83e253c8e0be5b0257b881e1827274667c5c17a9" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/webmozart/assert/zipball/83e253c8e0be5b0257b881e1827274667c5c17a9", + "reference": "83e253c8e0be5b0257b881e1827274667c5c17a9", + "shasum": "" + }, + "require": { + "php": "^5.3.3 || ^7.0", + "symfony/polyfill-ctype": "^1.8" + }, + "require-dev": { + "phpunit/phpunit": "^4.6", + "sebastian/version": "^1.0.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.3-dev" + } + }, + "autoload": { + "psr-4": { + "Webmozart\\Assert\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Bernhard Schussek", + "email": "bschussek@gmail.com" + } + ], + "description": "Assertions to validate method input/output with nice error messages.", + "keywords": [ + "assert", + "check", + "validate" + ], + "time": "2018-12-25T11:19:39+00:00" + } + ], + "aliases": [], + "minimum-stability": "dev", + "stability-flags": [], + "prefer-stable": true, + "prefer-lowest": false, + "platform": { + "php": ">=7.1" + }, + "platform-dev": [] +} diff --git a/src/Bayes.php b/src/Bayes.php index cfcf907..20e2580 100755 --- a/src/Bayes.php +++ b/src/Bayes.php @@ -6,39 +6,76 @@ */ class Bayes { + /** + * @var array + */ + public $STATE_KEYS = [ + 'categories', 'docCount', + 'totalDocuments', + 'vocabulary', 'vocabularySize', + 'wordCount', 'wordFrequencyCount' + ]; + + /** + * @var array + */ public $categories; + + /** + * @var number + */ public $docCount; + + /** + * @var number + */ public $totalDocuments; + + /** + * @var array + */ public $vocabulary; + + /** + * @var number + */ public $vocabularySize; + + /** + * @var number + */ public $wordCount; + + /** + * @var number + */ public $wordFrequencyCount; - public $STATE_KEYS = [ - 'categories', 'docCount', - 'totalDocuments', - 'vocabulary', 'vocabularySize', - 'wordCount', 'wordFrequencyCount' - ]; + /** + * @var array + */ + protected $options; + /** + * @var function + */ protected $tokenizer; - protected $options; /** * Initialize an instance of a Naive-Bayes Classifier + * * @param array $options */ public function __construct($options = null) { - // set options object - $that = $this; + // set options object $this->options = $options; if (!$this->options) { $this->options = []; } // set default tokenizer - $this->tokenizer = function ($text) use ($that) { + $this->tokenizer = function ($text) { // convert everything to lowercase $text = mb_strtolower($text); @@ -46,6 +83,7 @@ public function __construct($options = null) preg_match_all('/[[:alpha:]]+/u', $text, $matches); // first match list of words + return $matches[0]; }; @@ -57,45 +95,66 @@ public function __construct($options = null) } /** - * Reset the bayes class + * Identify the category of the provided text parameter. * - * @return Bayes + * @param string $text + * @return string the category or null */ - public function reset() + public function categorize($text) { - // hashmap of our category names - $this->categories = []; + $that = $this; + $maxProbability = -INF; + $chosenCategory = null; - // document frequency table for each of our categories - // => for each category, how often were documents mapped to it - $this->docCount = []; + if ($that->totalDocuments > 0) { + $probabilities = $that->probabilities($text); - // number of documents we have learned from - $this->totalDocuments = 0; - - // initialize our vocabulary and its size - $this->vocabulary = []; - $this->vocabularySize = 0; + // iterate thru our categories to find the one with max probability + // for this text + foreach ($probabilities as $category => $logProbability) { + if ($logProbability > $maxProbability) { + $maxProbability = $logProbability; + $chosenCategory = $category; + } + } + } - // for each category, how many words total were mapped to it - $this->wordCount = []; + return $chosenCategory; + } - // word frequency table for each category - // => for each category, how frequent was a given word mapped to it - $this->wordFrequencyCount = []; + /** + * Build a frequency hashmap where + * - the keys are the entries in `tokens` + * - the values are the frequency of each entry in `tokens` + * + * @param array $tokens array of string + * @return array hashmap of token frequency + */ + public function frequencyTable($tokens) + { + $frequencyTable = []; + // => for each category, how often were documents mapped to it + foreach ($tokens as $token) { + if (!isset($frequencyTable[$token])) { + $frequencyTable[$token] = 1; + } else { + $frequencyTable[$token]++; + } + } - return $this; + return $frequencyTable; } /** * Deserialize from json * - * @param object $json string or array + * @param object $json string or array * @return Bayes */ public function fromJson($json) { $result = $json; + // deserialize from json if (is_string($json)) { $result = json_decode($json, true); @@ -113,27 +172,10 @@ public function fromJson($json) return $this; } - /** - * Serialize to json - * - * @return string the json string - */ - public function toJson() - { - $result = []; - - // serialize to json - foreach ($this->STATE_KEYS as $k) { - $result[$k] = $this->{$k}; - } - - return json_encode($result); - } - /** * Make sure the category exists in dictionary * - * @param string $categoryName + * @param string $categoryName * @return Bayes */ public function initializeCategory($categoryName) @@ -151,109 +193,131 @@ public function initializeCategory($categoryName) /** * Teach your classifier * - * @param string $text - * @param string $category + * @param string $text + * @param string $category * @return Bayes */ public function learn($text, $category) { - $self = $this; + $that = $this; // initialize category data structures if we've never seen this category - $self->initializeCategory($category); + $that->initializeCategory($category); // update our count of how many documents mapped to this category - $self->docCount[$category]++; + $that->docCount[$category]++; // update the total number of documents we have learned from - $self->totalDocuments++; + $that->totalDocuments++; // normalize the text into a word array - $tokens = ($self->tokenizer)($text); + $tokens = ($that->tokenizer)($text); // get a frequency count for each token in the text - $frequencyTable = $self->frequencyTable($tokens); + $frequencyTable = $that->frequencyTable($tokens); // Update vocabulary and word frequency count for this category foreach ($frequencyTable as $token => $frequencyInText) { // add this word to our vocabulary if not already existing - if (!isset($self->vocabulary[$token])) { - $self->vocabulary[$token] = true; - $self->vocabularySize++; + if (!isset($that->vocabulary[$token])) { + $that->vocabulary[$token] = true; + $that->vocabularySize++; } // update the frequency information for this word in this category - if (!isset($self->wordFrequencyCount[$category][$token])) { - $self->wordFrequencyCount[$category][$token] = $frequencyInText; + if (!isset($that->wordFrequencyCount[$category][$token])) { + $that->wordFrequencyCount[$category][$token] = $frequencyInText; } else { - $self->wordFrequencyCount[$category][$token] += $frequencyInText; + $that->wordFrequencyCount[$category][$token] += $frequencyInText; } // update the count of all words we have seen mapped to this category - $self->wordCount[$category] += $frequencyInText; + $that->wordCount[$category] += $frequencyInText; } - return $self; + return $that; } /** - * Identify the category of the provided text parameter. + * Extract the probabilities for each known category + * * @param string $text - * @return string the category or null + * @return array probabilities by category or null */ - public function categorize($text) + public function probabilities($text) { - $self = $this; - $maxProbability = -INF; - $chosenCategory = null; + $that = $this; + $probabilities = []; - if ($self->totalDocuments > 0) { - $probabilities = $self->probabilities($text); + if ($that->totalDocuments > 0) { + $tokens = ($that->tokenizer)($text); + $frequencyTable = $that->frequencyTable($tokens); - // iterate thru our categories to find the one with max probability // for this text - foreach ($probabilities as $category => $logProbability) { - if ($logProbability > $maxProbability) { - $maxProbability = $logProbability; - $chosenCategory = $category; + // iterate thru our categories to find the one with max probability + foreach ($that->categories as $category => $value) { + $categoryProbability = $that->docCount[$category] / $that->totalDocuments; + $logProbability = log($categoryProbability); + foreach ($frequencyTable as $token => $frequencyInText) { + $tokenProbability = $that->tokenProbability($token, $category); + + // determine the log of the P( w | c ) for this word + $logProbability += $frequencyInText * log($tokenProbability); } + + $probabilities[$category] = $logProbability; } } - return $chosenCategory; + return $probabilities; } /** - * Extract the probabilities for each known category - * @param string $text - * @return array probabilities by category or null + * Reset the bayes class + * + * @return Bayes */ - public function probabilities($text) + public function reset() { - $self = $this; - $probabilities = []; + // add this word to our vocabulary if not already existing + $this->categories = []; - if ($self->totalDocuments > 0) { - $tokens = ($self->tokenizer)($text); - $frequencyTable = $self->frequencyTable($tokens); + // update the frequency information for this word in this category + // update the count of all words we have seen mapped to this category + $this->docCount = []; - // iterate thru our categories to find the one with max probability - // for this text - foreach ($self->categories as $category => $value) { - $categoryProbability = $self->docCount[$category] / $self->totalDocuments; - $logProbability = log($categoryProbability); - foreach ($frequencyTable as $token => $frequencyInText) { - $tokenProbability = $self->tokenProbability($token, $category); + // iterate thru our categories to find the one with max probability + $this->totalDocuments = 0; - // determine the log of the P( w | c ) for this word - $logProbability += $frequencyInText * log($tokenProbability); - } - - $probabilities[$category] = $logProbability; - } + // for this text + $this->vocabulary = []; + $this->vocabularySize = 0; + + // iterate thru our categories to find the one with max probability + $this->wordCount = []; + + // for this text + // determine the log of the P( w | c ) for this word + $this->wordFrequencyCount = []; + + return $this; + } + + /** + * Serialize to json + * + * @return string the json string + */ + public function toJson() + { + $result = []; + + // serialize to json + foreach ($this->STATE_KEYS as $k) { + $result[$k] = $this->{$k}; } - return $probabilities; + return json_encode($result); } /** @@ -261,7 +325,7 @@ public function probabilities($text) * * @param string $token * @param string $category - * @return number the probability + * @return number the probability */ public function tokenProbability($token, $category) { @@ -275,29 +339,7 @@ public function tokenProbability($token, $category) $wordCount = $this->wordCount[$category]; // use laplace Add-1 Smoothing equation - return ( $wordFrequencyCount + 1 ) / ( $wordCount + $this->vocabularySize ); - } - - /** - * Build a frequency hashmap where - * - the keys are the entries in `tokens` - * - the values are the frequency of each entry in `tokens` - * - * @param array $tokens array of string - * @return array hashmap of token frequency - */ - public function frequencyTable($tokens) - { - $frequencyTable = []; - // print(json_encode($tokens)); - foreach ($tokens as $token) { - if (!isset($frequencyTable[$token])) { - $frequencyTable[$token] = 1; - } else { - $frequencyTable[$token]++; - } - } - return $frequencyTable; + return ($wordFrequencyCount + 1) / ($wordCount + $this->vocabularySize); } }