diff --git a/.kontinuous/env/dev/templates/export.configmap.yaml b/.kontinuous/env/dev/templates/export.configmap.yaml index f754b09f4..f47d5ade8 100644 --- a/.kontinuous/env/dev/templates/export.configmap.yaml +++ b/.kontinuous/env/dev/templates/export.configmap.yaml @@ -4,10 +4,8 @@ metadata: name: export-elasticsearch data: HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql" - NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr" NODE_ENV: "production" LOG_LEVEL: "info" - NLP_PREPROD_DISABLE: "true" BUCKET_DEFAULT_FOLDER: "default" BUCKET_DRAFT_FOLDER: "draft" BUCKET_PREVIEW_FOLDER: "preview" @@ -20,3 +18,4 @@ data: ELASTICSEARCH_INDEX_PREPROD: "cdtn-dev-v2" ELASTICSEARCH_INDEX_PROD: "cdtn-dev-v2" MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-dev" + DISABLE_LIMIT_EXPORT: "true" diff --git a/.kontinuous/env/preprod/templates/export.configmap.yaml b/.kontinuous/env/preprod/templates/export.configmap.yaml index 8412c38c6..c19f125e9 100644 --- a/.kontinuous/env/preprod/templates/export.configmap.yaml +++ b/.kontinuous/env/preprod/templates/export.configmap.yaml @@ -4,7 +4,6 @@ metadata: name: export-elasticsearch data: HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql" - NLP_URL: "https://serving-ml.fabrique.social.gouv.fr" NODE_ENV: "production" LOG_LEVEL: "info" BUCKET_DEFAULT_FOLDER: "default" @@ -19,3 +18,4 @@ data: ELASTICSEARCH_INDEX_PREPROD: "cdtn-main-v2" ELASTICSEARCH_INDEX_PROD: "cdtn-main-v2" MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-preprod" + DISABLE_LIMIT_EXPORT: "true" diff --git a/.kontinuous/env/prod/templates/export.configmap.yaml b/.kontinuous/env/prod/templates/export.configmap.yaml index a128ecc57..d3ec21772 100644 --- a/.kontinuous/env/prod/templates/export.configmap.yaml +++ b/.kontinuous/env/prod/templates/export.configmap.yaml @@ -4,7 +4,6 @@ metadata: name: export-elasticsearch data: HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql" - NLP_URL: "https://serving-ml.fabrique.social.gouv.fr" NODE_ENV: "production" LOG_LEVEL: "info" BUCKET_DEFAULT_FOLDER: "default" @@ -18,4 +17,4 @@ data: AGREEMENTS_DESTINATION_NAME: "index.json" ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v2" ELASTICSEARCH_INDEX_PROD: "cdtn-prod-v2" - MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille" + MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille" \ No newline at end of file diff --git a/README.md b/README.md index 313183034..e7ccbe88b 100644 --- a/README.md +++ b/README.md @@ -157,9 +157,6 @@ DISABLE_LIMIT_EXPORT=true DISABLE_AGREEMENTS=true DISABLE_SITEMAP=true HASURA_GR - `DISABLE_COPY` is used to disable copy between two containers - `DISABLE_SITEMAP` is used to disable copy of the sitemap - `DISABLE_AGREEMENTS` is used to disable copy of the agreements -- `NLP_URL` could be set by `https://serving-ml-preprod.ovh.fabrique.social.gouv.fr`, by default it is `undefined` - -> **Note**: You can remove `NLP_URL` from your environment variables if you don't want to use the NLP service and gain time during the process of ingester elasticsearch. #### 6. Run the export elasticsearch @@ -178,7 +175,7 @@ yarn workspace frontend dev #### On client ```sh -NLP_URL=https://serving-ml-preprod.ovh.fabrique.social.gouv.fr yarn workspace @cdt/frontend dev +yarn workspace @cdt/frontend dev ``` 1. Go to `http://localhost:3001/` @@ -292,22 +289,6 @@ Cela permet de lier l'index elasticsearch automatiquement entre les deux branche L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la pre-production`. -> Note: Le glossary (injection des tooltips) et le NLP (vectorisation des données) sont par défaut désactivé en dev. - -#### Activer le glossary et le NLP - -Il faut commencer par donner les ressources nécessaires au processus dans l'environnement de dev : - -- Ouvrir le fichier `.kontinous/env/dev/values.yaml` -- Appliquer ce que les commentaires indiquent pour les ressources sur hasura et export - -L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la production`. - -/!\ /!\ /!\ ATTENTION /!\ /!\ /!\ : Bien penser à remettre les lignes en commentaire avant de merger dans master ! - -> Pourquoi changer les ressources ? -> L'export avec glossary et NLP est un processus qui demande beaucoup de RAM/CPU. Afin de ne pas surcharger le cluster de dev, on ne va pas demander ces ressources car l'export est peu utilisé pour les tests. Il n'existe aucun mécanisme sur la CI à l'heure actuelle pour permettre de faire le switch autrement. - ### Limitations connues - Les fichiers du site sont stockés au même endroit pour l'ensemble des branches. Si on ajoute/modifie/supprime un fichier, cela sera également le cas sur l'ensemble des branches diff --git a/docker-compose.yml b/docker-compose.yml index 55156ca97..0a43f72cb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -129,8 +129,8 @@ services: HASURA_GRAPHQL_ENDPOINT: "http://hasura:8080/v1/graphql" DISABLE_COPY: "true" DISABLE_SITEMAP: "true" + DISABLE_LIMIT_EXPORT: "true" DISABLE_AGREEMENTS: "true" - NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr" ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v1" ELASTICSEARCH_INDEX_PROD: "cdtn-v1" ELASTICSEARCH_URL_PREPROD: "http://elasticsearch:9200" diff --git a/shared/elasticsearch/package.json b/shared/elasticsearch/package.json index 03b429bff..56073539b 100644 --- a/shared/elasticsearch/package.json +++ b/shared/elasticsearch/package.json @@ -7,9 +7,6 @@ "@babel/plugin-transform-modules-commonjs" ] }, - "dependencies": { - "got": "^11.8.2" - }, "license": "Apache-2.0", "main": "src/index.js", "publishConfig": { diff --git a/shared/elasticsearch/src/index.d.ts b/shared/elasticsearch/src/index.d.ts index 123ea90e7..144120f60 100644 --- a/shared/elasticsearch/src/index.d.ts +++ b/shared/elasticsearch/src/index.d.ts @@ -4,7 +4,5 @@ export const documentMapping: any; export const DOCUMENTS: string; export const indexDocumentsBatched: any; export const SUGGESTIONS: string; -export const vectorizeDocument: any; export const version: any; export const suggestionMapping: any; -export const vectorizeQuery: any; diff --git a/shared/elasticsearch/src/mapping/document.mapping.js b/shared/elasticsearch/src/mapping/document.mapping.js index b3565628d..69bc59ab2 100644 --- a/shared/elasticsearch/src/mapping/document.mapping.js +++ b/shared/elasticsearch/src/mapping/document.mapping.js @@ -236,14 +236,11 @@ exports.documentMapping = { type: "text", }, - title_vector: { - dims: 512, - type: "dense_vector", - }, // The source URL url: { type: "keyword", }, + // used in prequalifieds variants: { type: "text", diff --git a/shared/elasticsearch/src/vectorizer/index.js b/shared/elasticsearch/src/vectorizer/index.js index 35aa353bc..385f97903 100644 --- a/shared/elasticsearch/src/vectorizer/index.js +++ b/shared/elasticsearch/src/vectorizer/index.js @@ -1,14 +1,5 @@ -// vectorizer is imported by code-du-travail-api which is using CommonJS, and throwing an exception -// when requiring code-du-travail-data ES module, thus we keep using CommonJS import here -const got = require("got"); const { stopwords: semantic_stopwords } = require("../dataset/stop_words"); -// URL of the TF serve deployment -const NLP_URL = - process.env.NLP_URL || "https://serving-ml.fabrique.social.gouv.fr"; -console.log("NLP URL:", NLP_URL); -const tfServeURL = NLP_URL + "/v1/models/sentqam:predict"; - function stripAccents(text) { // strip accents return text.normalize("NFD").replace(/[\u0300-\u036f]/g, ""); @@ -16,8 +7,6 @@ function stripAccents(text) { const stopWords = new Set(semantic_stopwords.map(stripAccents)); -const cache = new Map(); - function preprocess(text) { const stripped = stripAccents(text); @@ -31,48 +20,4 @@ function preprocess(text) { return noStopWords.join(" "); } -async function callTFServe(json) { - const response = await got.post(tfServeURL, { - cache, - json, - responseType: "json", - retry: { - limit: 15, - methods: ["POST"], - }, - }); - return response.body["outputs"]; -} - -async function vectorizeDocument(title, content) { - if (title == undefined || title == "") { - throw new Error("Cannot vectorize document with empty title."); - } - - const input = [preprocess(title)]; - const context = content ? [preprocess(content)] : ""; - - const body = { - inputs: { context, input }, - signature_name: "response_encoder", - }; - const vectors = await callTFServe(body); - - return vectors[0]; -} - -async function vectorizeQuery(query) { - if (!query) { - throw new Error("Cannot vectorize empty query."); - } - - const inputs = [preprocess(query)]; - const body = { - inputs, - signature_name: "question_encoder", - }; - const vectors = await callTFServe(body); - return vectors[0]; -} - -module.exports = { preprocess, vectorizeDocument, vectorizeQuery }; +module.exports = { preprocess }; diff --git a/shared/elasticsearch/src/vectorizer/index.test.js b/shared/elasticsearch/src/vectorizer/index.test.js index 2fb228e64..5c97106f3 100644 --- a/shared/elasticsearch/src/vectorizer/index.test.js +++ b/shared/elasticsearch/src/vectorizer/index.test.js @@ -1,44 +1,4 @@ -const { vectorizeDocument, vectorizeQuery, preprocess } = require("./index"); - -const timeout = 10000; - -test( - "Should vectorize document", - async () => { - const vector1 = await vectorizeDocument("titre", "contenu"); - expect(vector1).toBeDefined(); - // FIXME Should return the same result but don't. See with remi and fabien. - // expect(vector1).toMatchSnapshot(); - - // preprocessing should make those embeddings equal - // FIXME Should return the same result but don't. See with remi and fabien. - // const vector2 = await vectorizeDocument("le titre", "et le contènu"); - // expect(vector2).toEqual(vector1); - }, - timeout -); - -test( - "Should vectorize query", - async () => { - // FIXME Résultat aléatoire, voir pourquoi on n'obtient pas toujours la même réponse - // const vector1 = await vectorizeQuery("requete"); - // expect(vector1).toMatchSnapshot(); - // const vector2 = await vectorizeQuery("la requête"); - // expect(vector2).toEqual(vector1); - }, - timeout -); - -test( - "Should fail when no content passed", - async () => { - await expect(vectorizeQuery()).rejects.toThrow( - new Error("Cannot vectorize empty query.") - ); - }, - timeout -); +const { preprocess } = require("./index"); test("Should preprocess text", async () => { expect(preprocess("à la nôtre")).toEqual(""); diff --git a/shared/types/src/elastic/tools.ts b/shared/types/src/elastic/tools.ts index bda29d276..b835aa537 100644 --- a/shared/types/src/elastic/tools.ts +++ b/shared/types/src/elastic/tools.ts @@ -17,7 +17,6 @@ export type Tool = { source: string; text: string; title: string; - title_vector: number[]; _id: string; displayTool?: boolean; }; diff --git a/targets/export-elasticsearch/src/ingester/ingest.ts b/targets/export-elasticsearch/src/ingester/ingest.ts index d48fd6bd0..1c9d2ea3d 100644 --- a/targets/export-elasticsearch/src/ingester/ingest.ts +++ b/targets/export-elasticsearch/src/ingester/ingest.ts @@ -6,72 +6,32 @@ import { DOCUMENTS, indexDocumentsBatched, SUGGESTIONS, - vectorizeDocument, version, } from "@socialgouv/cdtn-elasticsearch"; import { logger } from "@shared/utils"; -import { SOURCES } from "@socialgouv/cdtn-sources"; -import pMap from "p-map"; import { cdtnDocumentsGen } from "./cdtnDocuments"; import { context } from "./context"; import { populateSuggestions } from "./suggestion"; -async function addVector(data: any) { - const NLP_URL = context.get("nlpUrl"); - if (NLP_URL) { - if (!data.title) { - logger.error(`No title for document ${data.source} / ${data.slug}`); - } - const title = data.title || "sans titre"; - await vectorizeDocument(title, data.text) - .then((title_vector: any) => { - if (title_vector.message) { - throw new Error(`error fetching message ${data.title}`); - } - data.title_vector = title_vector; - }) - .catch((err: any) => { - throw new Error( - `Vectorization failed: ${data.id} (${data.title} - ${err.retryCount} retries)` - ); - }); - } - - return Promise.resolve(data); -} - -// these sources do not need NLP vectorization -const excludeSources = [ - SOURCES.CDT, - SOURCES.GLOSSARY, - SOURCES.PREQUALIFIED, - SOURCES.HIGHLIGHTS, - SOURCES.SHEET_MT_PAGE, - SOURCES.VERSIONS, -]; - export async function ingest( cdtnAdminEndpoint: string | undefined, cdtnAdminEndpointSecret: string | undefined, esUrl: string | undefined, esTokenIngest: string | undefined, esIndexPrefix: string | undefined, - nlpUrl: string | undefined, suggestIndexName: string | undefined, bufferSize: number | undefined, suggestFile: string | undefined, isProd = false ) { context.provide(); - process.env.NLP_URL = nlpUrl; //pour setter la variable d'environment du package elasticsearch... await runIngester( cdtnAdminEndpoint, cdtnAdminEndpointSecret, esUrl, esTokenIngest, esIndexPrefix, - nlpUrl, suggestIndexName, bufferSize, suggestFile, @@ -85,7 +45,6 @@ async function runIngester( esUrl: string | undefined, esTokenIngest: string | undefined, esIndexPrefix: string | undefined, - nlpUrl: string | undefined, suggestIndexName: string | undefined, bufferSize: number | undefined, suggestFile: string | undefined, @@ -119,16 +78,9 @@ async function runIngester( context.set("suggestIndexName", suggestIndexName); context.set("bufferSize", bufferSize); context.set("suggestFile", suggestFile); - context.set("nlpUrl", nlpUrl); const ts = Date.now(); logger.info(`Using cdtn elasticsearch ${ELASTICSEARCH_URL}`); - if (nlpUrl) { - logger.info(`Using NLP service to retrieve tf vectors on ${nlpUrl}`); - } else { - logger.info(`NLP_URL not defined, semantic search will be disabled.`); - } - await version({ client }); logger.info(`Creating index ${DOCUMENT_INDEX_NAME}-${ts}`); @@ -142,18 +94,9 @@ async function runIngester( const updateDocs = async (source: string, documents: unknown[]) => { logger.info(`› ${source}... ${documents.length} items`); - let docs = documents; - - // add NLP vectors - if (!(excludeSources as string[]).includes(source)) { - docs = await pMap(documents, addVector, { - concurrency: 5, - }); - } - await indexDocumentsBatched({ client, - documents: docs, + documents, indexName: `${DOCUMENT_INDEX_NAME}-${ts}`, size: 800, }); diff --git a/targets/export-elasticsearch/src/workers/ingester-preprod.ts b/targets/export-elasticsearch/src/workers/ingester-preprod.ts index 64da1e569..995034a46 100644 --- a/targets/export-elasticsearch/src/workers/ingester-preprod.ts +++ b/targets/export-elasticsearch/src/workers/ingester-preprod.ts @@ -12,7 +12,6 @@ const ingester = async (): Promise => { process.env.BRANCH_NAME_SLUG ? `cdtn-${process.env.BRANCH_NAME_SLUG}` : process.env.ELASTICSEARCH_INDEX_PREPROD, - process.env.NLP_PREPROD_DISABLE ? undefined : process.env.NLP_URL, undefined, undefined, undefined diff --git a/targets/export-elasticsearch/src/workers/ingester-prod.ts b/targets/export-elasticsearch/src/workers/ingester-prod.ts index e4624b88f..f47b383b7 100644 --- a/targets/export-elasticsearch/src/workers/ingester-prod.ts +++ b/targets/export-elasticsearch/src/workers/ingester-prod.ts @@ -12,7 +12,6 @@ const ingester = async (): Promise => { process.env.BRANCH_NAME_SLUG ? `cdtn-${process.env.BRANCH_NAME_SLUG}` : process.env.ELASTICSEARCH_INDEX_PROD, - process.env.NLP_URL, undefined, undefined, undefined, diff --git a/yarn.lock b/yarn.lock index c98c8fa76..380d063d4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5486,7 +5486,6 @@ __metadata: "@babel/core": ^7.15.5 "@babel/plugin-transform-modules-commonjs": ^7.15.4 "@shared/eslint-config": "workspace:^" - got: ^11.8.2 jest: ^27.1.1 lint-staged: ^12.0.0 languageName: unknown @@ -12657,25 +12656,6 @@ __metadata: languageName: node linkType: hard -"got@npm:^11.8.2": - version: 11.8.6 - resolution: "got@npm:11.8.6" - dependencies: - "@sindresorhus/is": ^4.0.0 - "@szmarczak/http-timer": ^4.0.5 - "@types/cacheable-request": ^6.0.1 - "@types/responselike": ^1.0.0 - cacheable-lookup: ^5.0.3 - cacheable-request: ^7.0.2 - decompress-response: ^6.0.0 - http2-wrapper: ^1.0.0-beta.5.2 - lowercase-keys: ^2.0.0 - p-cancelable: ^2.0.0 - responselike: ^2.0.0 - checksum: bbc783578a8d5030c8164ef7f57ce41b5ad7db2ed13371e1944bef157eeca5a7475530e07c0aaa71610d7085474d0d96222c9f4268d41db333a17e39b463f45d - languageName: node - linkType: hard - "got@npm:^9.6.0": version: 9.6.0 resolution: "got@npm:9.6.0"