diff --git a/.wp-env.json b/.wp-env.json index d3be216b9..1fb420e93 100644 --- a/.wp-env.json +++ b/.wp-env.json @@ -1,5 +1,5 @@ { - "plugins": [".", "./tests/test-plugin", "https://downloads.wordpress.org/plugin/classic-editor.zip"], + "plugins": [".", "./tests/test-plugin", "https://downloads.wordpress.org/plugin/classic-editor.zip", "https://downloads.wordpress.org/plugin/elasticpress.zip"], "env": { "tests": { "mappings": { diff --git a/README.md b/README.md index aaab050ce..be7c6cf55 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro * Moderate incoming comments for sensitive content using [OpenAI's Moderation API](https://platform.openai.com/docs/guides/moderation) * Convert text content into audio and output a "read-to-me" feature on the front-end to play this audio using [Microsoft Azure's Text to Speech API](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/text-to-speech), [Amazon Polly](https://aws.amazon.com/polly/) or [OpenAI's Text to Speech API](https://platform.openai.com/docs/guides/text-to-speech) * Classify post content using [IBM Watson's Natural Language Understanding API](https://www.ibm.com/watson/services/natural-language-understanding/), [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings) or [Microsoft Azure's OpenAI service](https://azure.microsoft.com/en-us/products/ai-services/openai-service) +* Create a smart 404 page that has a recommended results section that suggests relevant content to the user based on the page URL they were trying to access using either [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings) or [Microsoft Azure's OpenAI service](https://azure.microsoft.com/en-us/products/ai-services/openai-service) in combination with [ElasticPress](https://github.com/10up/ElasticPress) * BETA: Recommend content based on overall site traffic via [Microsoft Azure's AI 
Personalizer API](https://azure.microsoft.com/en-us/services/cognitive-services/personalizer/) *(note that this service has been [deprecated by Microsoft](https://learn.microsoft.com/en-us/azure/ai-services/personalizer/) and as such, will no longer work. We are looking to replace this with a new provider to maintain the same functionality (see [issue#392](https://github.com/10up/classifai/issues/392))* * Generate image alt text, image tags, and smartly crop images using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/) * Scan images and PDF files for embedded text and save for use in post meta using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/) @@ -55,6 +56,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro * To utilize the Azure OpenAI Language Processing functionality, you will need an active [Microsoft Azure](https://signup.azure.com/signup) account and you will need to [apply](https://aka.ms/oai/access) for OpenAI access. * To utilize the Google Gemini Language Processing functionality, you will need an active [Google Gemini](https://ai.google.dev/tutorials/setup) account. * To utilize the AWS Language Processing functionality, you will need an active [AWS](https://console.aws.amazon.com/) account. +* To utilize the Smart 404 feature, you will need to use [ElasticPress](https://github.com/10up/ElasticPress) 5.0.0+ and [Elasticsearch](https://www.elastic.co/elasticsearch) 7.0+. 
## Pricing @@ -111,10 +113,10 @@ Add this repository to composer.json, specifying a release version, as shown bel "type": "package", "package": { "name": "10up/classifai", - "version": "2.0.0", + "version": "3.1.1", "type": "wordpress-plugin", "dist": { - "url": "https://github.com/10up/classifai/archive/refs/tags/2.0.0.zip", + "url": "https://github.com/10up/classifai/archive/refs/tags/3.1.1.zip", "type": "zip" } } @@ -126,7 +128,7 @@ Finally, require the plugin, using the version number you specified in the previ ```json "require": { - "10up/classifai": "3.0.0" + "10up/classifai": "3.1.1" } ``` @@ -440,6 +442,125 @@ Note that [OpenAI](https://platform.openai.com/docs/guides/speech-to-text) can c * Click the button to preview the generated speech audio for the post. * View the post on the front-end and see a read-to-me feature has been added +## Set Up the Smart 404 Feature + +### 1. Decide on Provider + +* This Feature is powered by either OpenAI or Azure OpenAI. +* Once you've chosen a Provider, you'll need to create an account and get authentication details. + * When setting things up on the Azure side, ensure you choose either the `text-embedding-3-small` or `text-embedding-3-large` model. The Feature will not work with other models. + +### 2. Configure Settings under Tools > ClassifAI > Language Processing > Smart 404 + +* Select the proper Provider in the provider dropdown. +* Enter your authentication details. +* Configure any other settings as desired. + +### 3. ElasticPress configuration + +Once the Smart 404 Feature is configured, you can then proceed to get ElasticPress set up to index the data. + +If on a standard WordPress installation: + +* Install and activate the [ElasticPress](https://github.com/10up/elasticpress) plugin. +* Set your Elasticsearch URL in the ElasticPress settings (`ElasticPress > Settings`). +* Go to the `ElasticPress > Sync` settings page and trigger a sync, ensuring this is set to run a sync from scratch. 
This will send over the new schema to Elasticsearch and index all content, including creating vector embeddings for each post. + +If on a WordPress VIP hosted environment: + +* [Enable Enterprise Search](https://docs.wpvip.com/enterprise-search/enable/) +* [Run the VIP-CLI `index` command](https://docs.wpvip.com/enterprise-search/index/). This sends the new schema to Elasticsearch and indexes all content, including creating vector embeddings for each post. Note you may need to use the `--setup` flag to ensure the schema is created correctly. + +At this point all of your content should be indexed, along with the embeddings data. You'll then need to update your 404 template to display the recommended results. + +### 4. Display the recommended results + +The Smart 404 Feature comes with a few helper functions that can be used to display the recommended results on your 404 page: + +* Directly display the results using the `Classifai\render_smart_404_results()` function. +* Get the data and then display it in your own way using the `Classifai\get_smart_404_results()` function. + +You will need to directly integrate these functions into your 404 template where desired. The plugin does not automatically display the results on the 404 page for you. + +Both functions support the following arguments. If any argument is not provided, the default value set on the settings page will be used: + +* `$index` (string) - The ElasticPress index to search in. Default is `post`. +* `$num` (int) - Maximum number of results to display. Default is `5`. +* `$num_candidates` (int) - Maximum number of results to search over. Default is `5000`. +* `$rescore` (bool) - Whether to run a rescore query or not. Can give better results but often is slower. Default is `false`. +* `$score_function` (string) - The [vector scoring function](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-script-score-query.html#vector-functions) to use. Default is `cosine`. 
Options are `cosine`, `dot_product`, `l1_norm` and `l2_norm`. + +The `Classifai\render_smart_404_results()` function also supports the following additional arguments: + +* `$fallback` (bool) - Whether to run a fallback WordPress query if no results are found in Elasticsearch. These results will then be rendered. Default is `true`. + +Examples: + +```php +// Render the results. +Classifai\render_smart_404_results( + [ + 'index' => 'post', + 'num' => 3, + 'num_candidates' => 1000, + 'rescore' => true, + 'fallback' => true, + 'score_function' => 'dot_product', + ] +); +``` + +```php +// Get the results. +$results = Classifai\get_smart_404_results( + [ + 'index' => 'post', + 'num' => 10, + 'num_candidates' => 8000, + 'rescore' => false, + 'score_function' => 'cosine', + ] +); + +ob_start(); + +// Render the results. +foreach ( $results as $result ) { +?> +
+ ID ) ) : ?> +
+ + ID ) ); ?> + +
+ + + post_title ); ?> + +
+
+ +
+
diff --git a/includes/Classifai/Features/Smart404.php b/includes/Classifai/Features/Smart404.php new file mode 100644 index 000000000..08fe1797d --- /dev/null +++ b/includes/Classifai/Features/Smart404.php @@ -0,0 +1,362 @@ +label = __( 'Smart 404', 'classifai' ); + + // Contains all providers that are registered to the service. + $this->provider_instances = $this->get_provider_instances( LanguageProcessing::get_service_providers() ); + + // Contains just the providers this feature supports. + $this->supported_providers = [ + OpenAIEmbeddings::ID => __( 'OpenAI Embeddings', 'classifai' ), + AzureEmbeddings::ID => __( 'Azure OpenAI Embeddings', 'classifai' ), + ]; + } + + /** + * Setup any needed integrations. + * + * This will always fire even if the Feature is not enabled + * so we add our own check. + */ + public function setup() { + // Ensure ElasticPress is installed before we proceed. + if ( ! is_elasticpress_installed() ) { + $warning_notice_func = function ( $current_feature ) { + if ( self::ID !== $current_feature ) { + return; + } + + echo ''; + ?> +

+

+

+ is_configured() && $this->is_enabled() ) { + $integration = new Smart404EPIntegration( $this->get_feature_provider_instance() ); + $integration->init(); + } + } + + /** + * Get the description for the enable field. + * + * @return string + */ + public function get_enable_description(): string { + return esc_html__( 'Enable Smart 404 functionality.', 'classifai' ); + } + + /** + * Add any needed custom fields. + */ + public function add_custom_settings_fields() { + $settings = $this->get_settings(); + + add_settings_field( + 'num', + esc_html__( 'Number of posts to show', 'classifai' ), + [ $this, 'render_input' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'num', + 'input_type' => 'number', + 'min' => 1, + 'step' => 1, + 'default_value' => $settings['num'], + 'description' => __( 'Determines the maximum number of posts that will show on a 404 page. This can be overridden in the display functions.', 'classifai' ), + ] + ); + + add_settings_field( + 'num_search', + esc_html__( 'Number of posts to search', 'classifai' ), + [ $this, 'render_input' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'num_search', + 'input_type' => 'number', + 'min' => 1, + 'step' => 1, + 'default_value' => $settings['num_search'], + 'description' => __( 'Determines the maximum number of posts Elasticsearch will use for the vector search. A higher number can give more accurate results but will be slower. This can be overridden in the display functions.', 'classifai' ), + ] + ); + + add_settings_field( + 'threshold', + esc_html__( 'Threshold', 'classifai' ), + [ $this, 'render_input' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'threshold', + 'input_type' => 'number', + 'min' => 0, + 'step' => 0.01, + 'default_value' => $settings['threshold'], + 'description' => __( 'Set the minimum threshold we want for our results. 
Any result that falls below this number will be automatically removed.', 'classifai' ), + ] + ); + + add_settings_field( + 'rescore', + esc_html__( 'Use rescore query', 'classifai' ), + [ $this, 'render_input' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'rescore', + 'input_type' => 'checkbox', + 'default_value' => $settings['rescore'], + 'description' => __( 'Will run a normal Elasticsearch query and then rescore those results using a vector query. Can give better results but often results in worse performance. This can be overridden in the display functions', 'classifai' ), + ] + ); + + add_settings_field( + 'fallback', + esc_html__( 'Use fallback results', 'classifai' ), + [ $this, 'render_input' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'fallback', + 'input_type' => 'checkbox', + 'default_value' => $settings['fallback'], + 'description' => __( 'If no results are found in Elasticsearch, will fallback to displaying most recent results from WordPress. This can be overridden in the display functions', 'classifai' ), + ] + ); + + add_settings_field( + 'score_function', + esc_html__( 'Score function', 'classifai' ), + [ $this, 'render_select' ], + $this->get_option_name(), + $this->get_option_name() . '_section', + [ + 'label_for' => 'score_function', + 'options' => [ + 'cosine' => __( 'Cosine', 'classifai' ), + 'dot_product' => __( 'Dot Product', 'classifai' ), + 'l1_norm' => __( 'L1 Norm', 'classifai' ), + 'l2_norm' => __( 'L2 Norm', 'classifai' ), + ], + 'default_value' => $settings['score_function'], + 'description' => __( 'Choose which vector scoring function you want to use. You may need to adjust the threshold if you change this. This can be overridden in the display functions', 'classifai' ), + ] + ); + } + + /** + * Returns the default settings for the Feature. 
+ * + * @return array + */ + public function get_feature_default_settings(): array { + return [ + 'provider' => OpenAIEmbeddings::ID, + 'num' => 3, + 'num_search' => 5000, + 'threshold' => 2.35, + 'rescore' => 0, + 'fallback' => 1, + 'score_function' => 'cosine', + ]; + } + + /** + * Sanitizes the default feature settings. + * + * @param array $new_settings Settings being saved. + * @return array + */ + public function sanitize_default_feature_settings( array $new_settings ): array { + $settings = $this->get_settings(); + + $new_settings['num'] = absint( $new_settings['num'] ?? $settings['num'] ); + $new_settings['num_search'] = absint( $new_settings['num_search'] ?? $settings['num_search'] ); + $new_settings['threshold'] = floatval( $new_settings['threshold'] ?? $settings['threshold'] ); + + if ( empty( $new_settings['rescore'] ) || 1 !== (int) $new_settings['rescore'] ) { + $new_settings['rescore'] = 'no'; + } else { + $new_settings['rescore'] = '1'; + } + + if ( empty( $new_settings['fallback'] ) || 1 !== (int) $new_settings['fallback'] ) { + $new_settings['fallback'] = 'no'; + } else { + $new_settings['fallback'] = '1'; + } + + if ( isset( $new_settings['score_function'] ) && in_array( $new_settings['score_function'], [ 'cosine', 'dot_product', 'l1_norm', 'l2_norm' ], true ) ) { + $new_settings['score_function'] = sanitize_text_field( $new_settings['score_function'] ); + } else { + $new_settings['score_function'] = 'cosine'; + } + + return $new_settings; + } + + /** + * Run an exact k-NN search. + * + * @param string $query Query to search for. + * @param array $args Arguments to pass to the search. + * @return array|WP_Error + */ + public function exact_knn_search( string $query, array $args = [] ) { + // Ensure the Feature is enabled and configured before trying to use it. + if ( ! is_elasticpress_installed() || ! $this->is_configured() || ! 
$this->is_enabled() ) { + return new WP_Error( 'not_enabled', __( 'Feature is not enabled.', 'classifai' ) ); + } + + // Ensure we have a query. + if ( empty( $query ) ) { + return new WP_Error( 'no_query', __( 'No query provided.', 'classifai' ) ); + } + + $settings = $this->get_settings(); + + // Parse the arguments, setting our defaults. + $args = wp_parse_args( + $args, + [ + 'index' => 'post', + 'post_type' => [ 'post' ], + 'num' => $settings['num'] ?? 5, + 'num_candidates' => $settings['num_search'] ?? 5000, + 'rescore' => $settings['rescore'] ?? '1', + 'fallback' => $settings['fallback'] ?? '1', + 'score_function' => $settings['score_function'] ?? 'cosine', + ] + ); + + /** + * Filter the arguments before running the search. + * + * @hook classifai_smart_404_exact_knn_search_args + * + * @param array $args Arguments to pass to the search. + * @param string $query Query to search for. + */ + $args = apply_filters( 'classifai_smart_404_exact_knn_search_args', $args, $query ); + + // Ensure our post types are set as an array. + if ( ! is_array( $args['post_type'] ) ) { + $args['post_type'] = [ $args['post_type'] ]; + } + + $integration = new Smart404EPIntegration( $this->get_feature_provider_instance() ); + + // Run our search. Note that this will take our query and generate embeddings for it. + if ( 'no' === $args['rescore'] || false === $args['rescore'] ) { + $results = $integration->exact_knn_search( $query, $args ); + } else { + $results = $integration->search_rescored_by_exact_knn( $query, $args ); + } + + // Ensure we have a good response. + if ( is_wp_error( $results ) ) { + // If we have fallback enabled, return those results. + if ( 'no' !== $args['fallback'] && false !== $args['fallback'] ) { + return $this->fallback_results( $args ); + } + + // translators: %s is the error message. 
+ return new WP_Error( 'error', sprintf( __( 'Error making request: %s.', 'classifai' ), $results->get_error_message() ) ); + } + + // Filter out any results that score below the configured threshold (2.35 when the setting is missing). + $results = array_filter( + $results, + function ( $result ) use ( $settings ) { + return (float) $result['score'] >= (float) ( $settings['threshold'] ?? 2.35 ); + } + ); + + // If we have no results after filtering and fallback is enabled, return those results. + if ( empty( $results ) && ( 'no' !== $args['fallback'] && false !== $args['fallback'] ) ) { + return $this->fallback_results( $args ); + } + + return $results; + } + + /** + * Run a fallback WordPress query for most recent results. + * + * @param array $args Arguments to pass to the search. + * @return array|WP_Error + */ + public function fallback_results( array $args = [] ) { + // Ensure the Feature is enabled and configured before trying to use it. + if ( ! $this->is_configured() || ! $this->is_enabled() ) { + return new WP_Error( 'not_enabled', __( 'Feature is not enabled.', 'classifai' ) ); + } + + $settings = $this->get_settings(); + + // Parse the arguments, setting our defaults. + $args = wp_parse_args( + $args, + [ + 'num' => $settings['num'] ?? 5, + ] + ); + + // Run our query. + $results = new WP_Query( + [ + 'post_type' => 'post', + 'posts_per_page' => $args['num'], + 'post_status' => 'publish', + 'orderby' => 'date', + 'order' => 'DESC', + ] + ); + + // Ensure we have some results. + if ( ! $results->have_posts() ) { + return new WP_Error( 'no_results', __( 'No results found.', 'classifai' ) ); + } + + return $results->posts; + } +} diff --git a/includes/Classifai/Features/Smart404EPIntegration.php b/includes/Classifai/Features/Smart404EPIntegration.php new file mode 100644 index 000000000..d1748fbcf --- /dev/null +++ b/includes/Classifai/Features/Smart404EPIntegration.php @@ -0,0 +1,681 @@ +embeddings_handler = $provider; + $this->es_version = ! $provider ? 
'7.0' : Elasticsearch::factory()->get_elasticsearch_version(); + $this->tokenizer = ! $this->embeddings_handler ? new Tokenizer( 8191 ) : new Tokenizer( (int) $this->embeddings_handler->get_max_tokens() ); + + if ( $provider ) { + if ( 'openai_embeddings' === $provider::ID ) { + $this->embeddings_meta_key = 'classifai_openai_embeddings'; + } elseif ( 'azure_openai_embeddings' === $provider::ID ) { + $this->embeddings_meta_key = 'classifai_azure_openai_embeddings'; + } + } + } + + /** + * Initialize the class and register the needed hooks. + */ + public function init() { + // Bail without a provider, or on Elasticsearch older than 7.0 (the release that added vector support). + if ( ! $this->embeddings_handler || ! $this->es_version || version_compare( $this->es_version, '7.0', '<' ) ) { + return; + } + + add_filter( 'ep_post_mapping', [ $this, 'add_post_vector_field_mapping' ] ); + add_filter( 'ep_prepare_meta_excluded_public_keys', [ $this, 'exclude_vector_meta' ] ); + add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); + add_filter( 'ep_retrieve_the_post', [ $this, 'add_score_field_to_document' ], 10, 2 ); + } + + /** + * Add our vector field mapping to the Elasticsearch post index. + * + * @param array $mapping Current mapping. + * @param bool $quantization Whether to use quantization for the vector field. Default true. + * @return array + */ + public function add_post_vector_field_mapping( array $mapping, bool $quantization = true ): array { + // Don't add the field if it already exists. + if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + return $mapping; + } + + // Add the default vector field mapping. + $mapping['mappings']['properties']['chunks'] = [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => (int) $this->embeddings_handler->get_dimensions(), + ], + ], + ]; + + // Add extra vector fields for newer versions of Elasticsearch. 
+ if ( version_compare( $this->es_version, '8.0', '>=' ) ) { + // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields + // were added in 8.0. The similarity field must be set if index is true. + $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( + $mapping['mappings']['properties']['chunks']['properties']['vector'], + [ + 'index' => true, + 'similarity' => 'cosine', + ] + ); + + // The element_type field was added in 8.6. This can be either float (default) or byte. + if ( version_compare( $this->es_version, '8.6', '>=' ) ) { + $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; + } + + // The int8_hnsw type was added in 8.12. + if ( $quantization && version_compare( $this->es_version, '8.12', '>=' ) ) { + // This is supposed to result in better performance but slightly less accurate results. + // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. + // Can test with this on and off and compare results to see what works best. + $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; + } + } + + return $mapping; + } + + /** + * Exclude our vector meta from being synced. + * + * @param array $excluded_keys Current excluded keys. + * @return array + */ + public function exclude_vector_meta( array $excluded_keys ): array { + $excluded_keys[] = $this->embeddings_meta_key; + $excluded_keys[] = $this->content_hash_meta_key; + + return $excluded_keys; + } + + /** + * Add the embedding data to the post vector sync args. + * + * @param array $args Current sync args. + * @param int $post_id Post ID being synced. + * @return array + */ + public function add_vector_field_to_post_sync( array $args, int $post_id ): array { + // No need to add vector data if no content exists. 
+ $post = get_post( $post_id ); + if ( empty( $post->post_content ) ) { + return $args; + } + + // Try to use the stored embeddings first if content hasn't changed. + $embeddings = get_post_meta( $post_id, $this->embeddings_meta_key, true ); + $content_hash = get_post_meta( $post_id, $this->content_hash_meta_key, true ); + + // This will include the post title and post content combined. + $content = $this->embeddings_handler->get_normalized_content( $post_id, 'post' ); + + // Add the post slug to our content as well. + $content = $post->post_name . ".\n\n" . $content; + + // If they don't exist or content has changed, make API requests to generate them. + if ( ! $embeddings || md5( $content ) !== $content_hash ) { + $embeddings = []; + + // Chunk the content into smaller pieces. + $content_chunks = $this->embeddings_handler->chunk_content( $content ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + $total_tokens = $this->tokenizer->tokens_in_content( $content ); + + // If we have a lot of tokens, we need to get embeddings for each chunk individually. + if ( (int) $this->embeddings_handler->get_max_tokens() < $total_tokens || ! method_exists( $this->embeddings_handler, 'generate_embeddings' ) ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->get_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = $embedding; + } + + // Show an error message if something went wrong. + if ( is_wp_error( $embedding ) ) { + if ( is_indexing_wpcli() ) { + WP_CLI::warning( + sprintf( + /* translators: %d is the post ID; %s is the error message */ + esc_html__( 'Error generating embedding for ID #%1$d: %2$s', 'classifai' ), + $post_id, + $embedding->get_error_message() + ) + ); + } + } + } + } else { + // Otherwise let's get all embeddings in a single request. + $embeddings = $this->get_embeddings( $content_chunks ); + + // Show an error message if something went wrong. 
+ if ( is_wp_error( $embeddings ) ) { + if ( is_indexing_wpcli() ) { + WP_CLI::warning( + sprintf( + /* translators: %d is the post ID; %s is the error message */ + esc_html__( 'Error generating embedding for ID #%1$d: %2$s', 'classifai' ), + $post_id, + $embeddings->get_error_message() + ) + ); + } + + $embeddings = []; + } + } + } + + // Store the embeddings for future use. + if ( ! empty( $embeddings ) ) { + update_post_meta( $post_id, $this->embeddings_meta_key, $embeddings ); + update_post_meta( $post_id, $this->content_hash_meta_key, md5( $content ) ); + } + } + + // If we still don't have embeddings, return early. + if ( ! $embeddings || empty( $embeddings ) ) { + if ( is_indexing_wpcli() ) { + WP_CLI::warning( + sprintf( + /* translators: %d is the post ID */ + esc_html__( 'No embeddings generated for ID #%d', 'classifai' ), + $post_id + ) + ); + } + + return $args; + } + + // Add the embeddings data to the sync args. + $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } + + /** + * Add the score field to the document. + * + * @param array $document Document retrieved from Elasticsearch. + * @param array $hit Raw Elasticsearch hit. + * @return array + */ + public function add_score_field_to_document( array $document, array $hit ): array { + // Only modify if our field is present. + if ( ! isset( $document['chunks'] ) ) { + return $document; + } + + // Add the score to the document if it exists. + if ( isset( $hit['_score'] ) ) { + $document['score'] = $hit['_score']; + } + + return $document; + } + + /** + * Get an embedding from a given text. + * + * @param string $text Text to get the embedding for. + * @param bool $cache Whether to cache the result. Default false. + * @return array|WP_Error + */ + public function get_embedding( string $text, bool $cache = false ) { + // Check to see if we have a stored embedding. 
+ if ( $cache ) { + $key = 'classifai_ep_embedding_' . sanitize_title( $text ); + $query_embedding = wp_cache_get( $key ); + + if ( $query_embedding ) { + return $query_embedding; + } + } + + // Generate the embedding. + $embedding = $this->embeddings_handler->generate_embedding( $text, new Smart404() ); + + if ( is_wp_error( $embedding ) ) { + return $embedding; + } + + // Store the embedding for future use if desired. + if ( $cache ) { + wp_cache_set( $key, $embedding ); + } + + return $embedding; + } + + /** + * Get multiple embeddings at once. + * + * @param array $strings Array of text to get embeddings for. + * @return array|WP_Error + */ + public function get_embeddings( array $strings ) { + // Generate the embeddings. + $embeddings = $this->embeddings_handler->generate_embeddings( $strings, new Smart404() ); + + return $embeddings; + } + + /** + * Run an exact k-nearest neighbor (kNN) search. + * + * @param string $query Query to search for. + * @param array $args { + * Optional. Arguments to pass to the search. + * + * @type string $index Indexable to run the query against. Default post. + * @type array $post_type Post types to return results of. Defaults to just post. + * @type int $num Number of items to return. + * @type string $score_function Function to use for scoring. Default cosine. + * } + * @return array|WP_Error + */ + public function exact_knn_search( string $query, array $args = [] ) { + $query_embedding = $this->get_embedding( $query, true ); + + if ( is_wp_error( $query_embedding ) ) { + return $query_embedding; + } + + // Parse the arguments, setting our defaults. + $args = wp_parse_args( + $args, + [ + 'index' => 'post', + 'post_type' => [ 'post' ], + 'num' => 5, + 'score_function' => 'cosine', + ] + ); + + // Get the ElasticPress indexable. + $indexable = Indexables::factory()->get( $args['index'] ); + + if ( ! 
$indexable ) { + return new WP_Error( 'invalid_index', esc_html__( 'Invalid indexable provided.', 'classifai' ) ); + } + + // Build our exact kNN query. + $knn_query = [ + 'from' => 0, + 'size' => (int) $args['num'], + 'query' => [ + 'bool' => [ + 'must' => [ + [ + 'terms' => [ + 'post_type.raw' => $args['post_type'], + ], + ], + [ + 'terms' => [ + 'post_status' => [ + 'publish', + ], + ], + ], + [ + 'nested' => [ + 'path' => 'chunks', + 'query' => [ + 'script_score' => [ + 'query' => [ + 'match_all' => (object) [], + ], + 'script' => [ + 'source' => $this->get_script_source( $args['score_function'] ), + 'params' => [ + 'query_vector' => array_map( 'floatval', $query_embedding ), + ], + ], + ], + ], + ], + ], + ], + ], + ], + ]; + + // Run the query using the ElasticPress indexable. + $res = $indexable->query_es( $knn_query, [] ); + + if ( false === $res || ! isset( $res['documents'] ) ) { + return new WP_Error( 'es_error', esc_html__( 'Unable to query Elasticsearch', 'classifai' ) ); + } + + return $res['documents']; + }
+
+	/**
+	 * Runs a regular ES search, then re-ranks its hits with an exact kNN query.
+	 *
+	 * The first pass asks the indexable for up to `num_candidates` documents.
+	 * For the `post` indexable that pass uses the formatted search query with
+	 * `post_name` added to its multi_match fields; for any other indexable only
+	 * `from` and `size` are sent. The second pass restricts an exact kNN
+	 * script-score query to the post IDs found and returns the top `num` documents.
+	 *
+	 * @param string $query Query to search for.
+	 * @param array  $args {
+	 *     Optional. Arguments to pass to the search.
+	 *
+	 *     @type string $index          Indexable to run the query against. Default post.
+	 *     @type array  $post_type      Post types to return results of. Defaults to just post.
+	 *     @type int    $num            Number of items to return. Default 5.
+	 *     @type int    $num_candidates Number of candidates to search. Larger numbers give better results but are slower. Default 50.
+	 *     @type string $score_function Function to use for scoring. Default cosine.
+	 * }
+	 * @return array|WP_Error Elasticsearch documents, or a WP_Error when the embedding
+	 *                        fails, the indexable is unknown, either Elasticsearch
+	 *                        query fails, or the first pass finds no post IDs.
+	 */
+	public function search_rescored_by_exact_knn( string $query, array $args = [] ) {
+		// The embedding is needed for the re-ranking pass; bail out early without it.
+		$query_embedding = $this->get_embedding( $query, true );
+		if ( is_wp_error( $query_embedding ) ) {
+			return $query_embedding;
+		}
+
+		// Fill in anything the caller left out.
+		$defaults = [
+			'index'          => 'post',
+			'post_type'      => [ 'post' ],
+			'num'            => 5,
+			'num_candidates' => 50,
+			'score_function' => 'cosine',
+		];
+		$args     = wp_parse_args( $args, $defaults );
+
+		// Resolve the ElasticPress indexable both passes run against.
+		$indexable = Indexables::factory()->get( $args['index'] );
+		if ( ! $indexable ) {
+			return new WP_Error( 'invalid_index', esc_html__( 'Invalid indexable provided.', 'classifai' ) );
+		}
+
+		// First pass: the candidate query starts out with paging only.
+		$candidate_limit = (int) $args['num_candidates'];
+		$candidate_query = [
+			'from' => 0,
+			'size' => $candidate_limit,
+		];
+
+		// Expand the candidate query depending on the indexable type.
+		switch ( $args['index'] ) {
+			case 'post':
+				$formatted = $this->default_search_post_query( $query, $args['post_type'], $candidate_limit, $indexable );
+
+				// Nothing is copied over (post filter included) unless a query was produced.
+				if ( ! isset( $formatted['query'] ) ) {
+					break;
+				}
+
+				$candidate_query['query'] = $formatted['query'];
+
+				// Add the post_name field to the first three multi_match clauses, where present.
+				foreach ( [ 0, 1, 2 ] as $clause ) {
+					$fields = $candidate_query['query']['function_score']['query']['bool']['should'][ $clause ]['multi_match']['fields'] ?? null;
+
+					if ( null !== $fields ) {
+						$candidate_query['query']['function_score']['query']['bool']['should'][ $clause ]['multi_match']['fields'] = array_merge( $fields, [ 'post_name' ] );
+					}
+				}
+
+				if ( isset( $formatted['post_filter'] ) ) {
+					$candidate_query['post_filter'] = $formatted['post_filter'];
+				}
+
+				break;
+		}
+
+		$candidates = $indexable->query_es( $candidate_query, [] );
+		if ( false === $candidates || ! isset( $candidates['documents'] ) ) {
+			return new WP_Error( 'es_error', esc_html__( 'Unable to query Elasticsearch', 'classifai' ) );
+		}
+
+		// The re-ranking pass is limited to the posts found above.
+		$post_ids = array_column( $candidates['documents'], 'post_id' );
+		if ( empty( $post_ids ) ) {
+			return new WP_Error( 'es_error', esc_html__( 'No post IDs found', 'classifai' ) );
+		}
+
+		// Clause 1: only consider the candidate posts.
+		$candidate_filter = [
+			'bool' => [
+				'must' => [
+					'terms' => [
+						'post_id' => $post_ids,
+					],
+				],
+			],
+		];
+
+		// Clause 2: score the nested chunks against the query embedding.
+		$vector_score = [
+			'nested' => [
+				'path'  => 'chunks',
+				'query' => [
+					'script_score' => [
+						'query'  => [
+							'match_all' => (object) [],
+						],
+						'script' => [
+							'source' => $this->get_script_source( $args['score_function'] ),
+							'params' => [
+								'query_vector' => array_map( 'floatval', $query_embedding ),
+							],
+						],
+					],
+				],
+			],
+		];
+
+		// Second pass: the exact kNN query over the candidates.
+		$knn_query = [
+			'from'  => 0,
+			'size'  => (int) $args['num'],
+			'query' => [
+				'bool' => [
+					'must' => [
+						$candidate_filter,
+						$vector_score,
+					],
+				],
+			],
+		];
+
+		$reranked = $indexable->query_es( $knn_query, [] );
+		if ( false === $reranked || ! isset( $reranked['documents'] ) ) {
+			return new WP_Error( 'es_error', esc_html__( 'Unable to query Elasticsearch', 'classifai' ) );
+		}
+
+		return $reranked['documents'];
+	}
+
+	/**
+	 * Build a default search post query.
+	 *
+	 * Prepares a blank WP_Query carrying the search arguments and hands it to
+	 * the indexable's `format_args()`, returning whatever that produces.
+	 *
+	 * @param string    $query     Query to search for.
+	 * @param array     $post_type Post types to return results of. An empty array searches `any`.
+	 * @param int       $num       Number of items to return.
+	 * @param Indexable $indexable Indexable to run the query against.
+	 * @return array
+	 */
+	private function default_search_post_query( string $query, array $post_type, int $num, Indexable $indexable ): array {
+		// Start from an initialised, empty query object; no database query is requested here.
+		$wp_query = new \WP_Query();
+		$wp_query->init();
+
+		$wp_query->query = wp_parse_args(
+			[
+				's'              => $query,
+				'post_type'      => empty( $post_type ) ? 'any' : $post_type,
+				'posts_per_page' => $num,
+			]
+		);
+
+		// The query vars mirror the raw query arguments.
+		$wp_query->query_vars = $wp_query->query;
+
+		return $indexable->format_args( $wp_query->query_vars, $wp_query );
+	}
+
+	/**
+	 * Set the script source based on the desired score function.
+	 *
+	 * Recognised values are `cosine`/`cosine_similarity`, `dot`/`dot_product`,
+	 * `l1_norm`/`l1norm` and `l2_norm`/`l2norm`. Any other value yields an
+	 * empty string.
+	 *
+	 * @param string $type Type of score function to use. Default "cosine".
+	 * @return string Script source, or an empty string for an unrecognised type.
+	 */
+	private function get_script_source( string $type = 'cosine' ): string {
+		// Each expression maps the raw vector function onto a non-negative score.
+		$cosine  = 'cosineSimilarity(params.query_vector, "chunks.vector") + 1.0';
+		$dot     = 'double value = dotProduct(params.query_vector, "chunks.vector"); return sigmoid(1, Math.E, -value);';
+		$l1_norm = '1 / (1 + l1norm(params.query_vector, "chunks.vector"))';
+		$l2_norm = '1 / (1 + l2norm(params.query_vector, "chunks.vector"))';
+
+		// Every function is reachable under two spellings.
+		$sources = [
+			'cosine'            => $cosine,
+			'cosine_similarity' => $cosine,
+			'dot'               => $dot,
+			'dot_product'       => $dot,
+			'l1_norm'           => $l1_norm,
+			'l1norm'            => $l1_norm,
+			'l2_norm'           => $l2_norm,
+			'l2norm'            => $l2_norm,
+		];
+
+		return $sources[ $type ] ?? '';
+	}
+
+	/**
+	 * Convert Elasticsearch results to post-like objects.
+	 *
+	 * Entries that already are WP_Post instances are passed through untouched.
+	 * Every other entry is turned into a stdClass carrying the post fields found
+	 * in the document plus an `elasticsearch` flag set to true. Entries that are
+	 * not arrays, or that have no `post_id`, are skipped.
+	 *
+	 * @param array $results Document results from Elasticsearch.
+	 * @return array List of WP_Post and/or stdClass objects.
+	 */
+	public function convert_es_results_to_post_objects( array $results ): array {
+		// Document fields copied onto the post object when present.
+		$post_return_args = [
+			'post_type',
+			'post_author',
+			'post_name',
+			'post_status',
+			'post_title',
+			'post_content',
+			'post_excerpt',
+			'post_date',
+			'post_date_gmt',
+			'permalink',
+		];
+
+		$new_posts = [];
+
+		// Turn each ES result into a post object.
+		// Adapted from ElasticPress\Indexable\Post\QueryIntegration::format_hits_as_posts.
+		foreach ( $results as $post_array ) {
+			// Don't convert if not needed.
+			if ( $post_array instanceof \WP_Post ) {
+				$new_posts[] = $post_array;
+				continue;
+			}
+
+			// A document without a post ID cannot be mapped to a post.
+			if ( ! is_array( $post_array ) || ! isset( $post_array['post_id'] ) ) {
+				continue;
+			}
+
+			$post = new \stdClass();
+
+			$post->ID = $post_array['post_id'];
+
+			// Prefer the site ID stored on the document, else assume the current site.
+			$post->site_id = ! empty( $post_array['site_id'] ) ? $post_array['site_id'] : get_current_blog_id();
+
+			foreach ( $post_return_args as $key ) {
+				// The author is stored as a nested structure; only its ID is copied.
+				if ( 'post_author' === $key ) {
+					if ( isset( $post_array['post_author']['id'] ) ) {
+						$post->post_author = $post_array['post_author']['id'];
+					}
+
+					continue;
+				}
+
+				if ( isset( $post_array[ $key ] ) ) {
+					$post->$key = $post_array[ $key ];
+				}
+			}
+
+			// Mark the object as coming from Elasticsearch.
+			$post->elasticsearch = true;
+
+			$new_posts[] = $post;
+		}
+
+		return $new_posts;
+	}
+}
diff --git a/includes/Classifai/Helpers.php b/includes/Classifai/Helpers.php index 5968fe69b..494d64b96 100644 --- a/includes/Classifai/Helpers.php +++ b/includes/Classifai/Helpers.php @@ -3,6 +3,8 @@ namespace Classifai; use Classifai\Features\Classification; +use Classifai\Features\Smart404; +use Classifai\Features\Smart404EPIntegration; use Classifai\Providers\Provider; use Classifai\Admin\UserProfile; use Classifai\Providers\Watson\NLU; @@ -661,3 +663,94 @@ function get_classification_mode(): string { return $value; }
+
+/**
+ * Get all parts from the current URL.
+ *
+ * For instance, if the URL is `https://example.com/this/is/a/test/`,
+ * this function will return: `[ 'this', 'is', 'a', 'test' ]`.
+ *
+ * Empty segments are dropped, a literal `0` segment is kept, and the
+ * result is re-indexed from zero.
+ *
+ * @return array
+ */
+function get_url_slugs(): array {
+	global $wp;
+
+	// Fall back to an empty request when `$wp->request` is not set, so explode() never receives null.
+	$request = isset( $wp->request ) ? (string) $wp->request : '';
+
+	// Compare against the empty string explicitly: a bare array_filter() would also discard "0".
+	$parts = array_filter(
+		explode( '/', $request ),
+		static function ( string $part ): bool {
+			return '' !== $part;
+		}
+	);
+
+	return array_values( $parts );
+}
+
+/**
+ * Get the last part from the current URL.
+ *
+ * For instance, if the URL is `https://example.com/this/is/a/test`,
+ * this function will return: 'test'. An empty string is returned
+ * when the URL has no parts.
+ *
+ * @return string
+ */
+function get_last_url_slug(): string {
+	$parts = get_url_slugs();
+
+	// end() returns false for an empty array; report that as an empty slug.
+	if ( empty( $parts ) ) {
+		return '';
+	}
+
+	return trim( (string) end( $parts ) );
+}
+
+/**
+ * Check if ElasticPress is installed.
+ *
+ * The check looks for the `ElasticPress\Feature` class.
+ *
+ * @return bool
+ */
+function is_elasticpress_installed(): bool {
+	return class_exists( '\\ElasticPress\\Feature' );
+}
+
+/**
+ * Get the Smart 404 results.
+ *
+ * Runs an exact kNN search for the last slug of the current URL and
+ * converts the matching documents into post objects. A failed search
+ * produces an empty array.
+ *
+ * @param array $args Arguments to pass to the search.
+ * @return array
+ */
+function get_smart_404_results( array $args = [] ): array {
+	// Search for content related to the last part of the requested URL.
+	$documents = ( new Smart404() )->exact_knn_search( get_last_url_slug(), $args );
+
+	// A failed query yields no results; otherwise hand back normal post objects.
+	return is_wp_error( $documents )
+		? []
+		: ( new Smart404EPIntegration() )->convert_es_results_to_post_objects( $documents );
+}
+
+/** + * Render the Smart 404 results. + * + * @param array $args Arguments to pass to the search. + */ +function render_smart_404_results( array $args = [] ) { + // Get the results. + $results = get_smart_404_results( $args ); + + // Handle situation where we don't have results. + if ( empty( $results ) ) { + return; + } + + // Iterate through each result and render it. + echo '