diff --git a/.github/workflows/push-server.yml b/.github/workflows/push-server.yml
index 61ec5152dc..92b22a69c7 100644
--- a/.github/workflows/push-server.yml
+++ b/.github/workflows/push-server.yml
@@ -450,3 +450,147 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
+
+ word-id-cronjob:
+ name: Push Word ID cronjob
+ runs-on: ${{ matrix.runner }}
+ strategy:
+ matrix:
+ runner: [blacksmith-8vcpu-ubuntu-2204]
+ platform: [linux/amd64]
+ exclude:
+ - runner: blacksmith-8vcpu-ubuntu-2204
+ platform: linux/arm64
+ - runner: blacksmith-8vcpu-ubuntu-2204-arm
+ platform: linux/amd64
+ steps:
+ - name: Checkout the repo
+ uses: actions/checkout@v4
+
+ - name: Setup buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to Docker Hub
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Docker meta
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ # list of Docker images to use as base name for tags
+ images: |
+ trieve/word-id-cronjob
+ tags: |
+ type=raw,latest
+ type=sha
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v5
+ with:
+ platforms: ${{ matrix.platform }}
+ cache-from: type=registry,ref=trieve/buildcache:word-id-cronjob-${{matrix.runner}}
+ cache-to: type=registry,ref=trieve/buildcache:word-id-cronjob-${{matrix.runner}},mode=max
+ context: server/
+ file: ./server/Dockerfile.word-id-cronjob
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
+
+ word-worker:
+ name: Push Word Worker
+ runs-on: ${{ matrix.runner }}
+ strategy:
+ matrix:
+ runner: [blacksmith-8vcpu-ubuntu-2204]
+ platform: [linux/amd64]
+ exclude:
+ - runner: blacksmith-8vcpu-ubuntu-2204
+ platform: linux/arm64
+ - runner: blacksmith-8vcpu-ubuntu-2204-arm
+ platform: linux/amd64
+ steps:
+ - name: Checkout the repo
+ uses: actions/checkout@v4
+
+ - name: Setup buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to Docker Hub
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Docker meta
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ # list of Docker images to use as base name for tags
+ images: |
+ trieve/word-worker
+ tags: |
+ type=raw,latest
+ type=sha
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v5
+ with:
+ platforms: ${{ matrix.platform }}
+ cache-from: type=registry,ref=trieve/buildcache:word-worker-${{matrix.runner}}
+ cache-to: type=registry,ref=trieve/buildcache:word-worker-${{matrix.runner}},mode=max
+ context: server/
+ file: ./server/Dockerfile.word-worker
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
+
+ bktree-worker:
+ name: Push BK-Tree Worker
+ runs-on: ${{ matrix.runner }}
+ strategy:
+ matrix:
+ runner: [blacksmith-8vcpu-ubuntu-2204]
+ platform: [linux/amd64]
+ exclude:
+ - runner: blacksmith-8vcpu-ubuntu-2204
+ platform: linux/arm64
+ - runner: blacksmith-8vcpu-ubuntu-2204-arm
+ platform: linux/amd64
+ steps:
+ - name: Checkout the repo
+ uses: actions/checkout@v4
+
+ - name: Setup buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to Docker Hub
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Docker meta
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ # list of Docker images to use as base name for tags
+ images: |
+ trieve/bktree-worker
+ tags: |
+ type=raw,latest
+ type=sha
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v5
+ with:
+ platforms: ${{ matrix.platform }}
+ cache-from: type=registry,ref=trieve/buildcache:bktree-worker-${{matrix.runner}}
+ cache-to: type=registry,ref=trieve/buildcache:bktree-worker-${{matrix.runner}},mode=max
+ context: server/
+ file: ./server/Dockerfile.bktree-worker
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
diff --git a/frontends/search/src/components/GroupPage.tsx b/frontends/search/src/components/GroupPage.tsx
index 089f976515..e254b8b447 100644
--- a/frontends/search/src/components/GroupPage.tsx
+++ b/frontends/search/src/components/GroupPage.tsx
@@ -263,6 +263,18 @@ export const GroupPage = (props: GroupPageProps) => {
slim_chunks: search.debounced.slimChunks,
page_size: search.debounced.pageSize,
get_total_pages: search.debounced.getTotalPages,
+ typo_options: {
+ correct_typos: search.debounced.correctTypos,
+ one_typo_word_range: {
+ min: search.debounced.oneTypoWordRangeMin,
+ max: search.debounced.oneTypoWordRangeMax,
+ },
+ two_typo_word_range: {
+ min: search.debounced.twoTypoWordRangeMin,
+ max: search.debounced.twoTypoWordRangeMax,
+ },
+ disable_on_words: search.debounced.disableOnWords,
+ },
highlight_options: {
highlight_results: search.debounced.highlightResults,
highlight_strategy: search.debounced.highlightStrategy,
diff --git a/frontends/search/src/components/ResultsPage.tsx b/frontends/search/src/components/ResultsPage.tsx
index 5b4f3b166d..438ae3d098 100644
--- a/frontends/search/src/components/ResultsPage.tsx
+++ b/frontends/search/src/components/ResultsPage.tsx
@@ -282,6 +282,18 @@ const ResultsPage = (props: ResultsPageProps) => {
slim_chunks: props.search.debounced.slimChunks ?? false,
page_size: props.search.debounced.pageSize ?? 10,
get_total_pages: props.search.debounced.getTotalPages ?? false,
+ typo_options: {
+ correct_typos: props.search.debounced.correctTypos,
+ one_typo_word_range: {
+ min: props.search.debounced.oneTypoWordRangeMin,
+ max: props.search.debounced.oneTypoWordRangeMax,
+ },
+ two_typo_word_range: {
+ min: props.search.debounced.twoTypoWordRangeMin,
+ max: props.search.debounced.twoTypoWordRangeMax,
+ },
+ disable_on_words: props.search.debounced.disableOnWords,
+ },
highlight_options: {
highlight_results: props.search.debounced.highlightResults ?? true,
highlight_strategy:
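Both GroupPage and ResultsPage now attach a `typo_options` object to the search request body. As a rough illustration of that payload shape, here is a hedged Rust sketch that deserializes the same JSON with serde; the field names mirror the request built above, but the struct names, optionality, and integer widths are assumptions, not the server's actual types.

```rust
// Assumes the serde and serde_json crates (serde with the derive feature);
// the layout below is inferred from the frontend payload, not taken from the
// server source.
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct TypoRange {
    min: u32,
    max: Option<u32>, // the UI lets the user clear "max", which serializes as null
}

#[derive(Debug, Serialize, Deserialize)]
struct TypoOptions {
    correct_typos: Option<bool>,
    one_typo_word_range: Option<TypoRange>,
    two_typo_word_range: Option<TypoRange>,
    disable_on_words: Option<Vec<String>>,
}

fn main() {
    let payload = r#"{
        "correct_typos": true,
        "one_typo_word_range": { "min": 5, "max": 8 },
        "two_typo_word_range": { "min": 8, "max": null },
        "disable_on_words": ["trieve"]
    }"#;

    let opts: TypoOptions = serde_json::from_str(payload).expect("valid typo_options JSON");
    println!("{opts:?}");
}
```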
diff --git a/frontends/search/src/components/SearchForm.tsx b/frontends/search/src/components/SearchForm.tsx
index 25be35cf10..989c60952d 100644
--- a/frontends/search/src/components/SearchForm.tsx
+++ b/frontends/search/src/components/SearchForm.tsx
@@ -1051,6 +1051,13 @@ const SearchForm = (props: {
pageSize: 10,
getTotalPages: false,
highlightStrategy: "exactmatch",
+ correctTypos: false,
+ oneTypoWordRangeMin: 5,
+ oneTypoWordRangeMax: 8,
+ twoTypoWordRangeMin: 8,
+ twoTypoWordRangeMax: null,
+ disableOnWords: [],
+ typoTolerance: false,
highlightResults: true,
highlightDelimiters: ["?", ".", "!"],
highlightMaxLength: 8,
@@ -1195,7 +1202,7 @@ const SearchForm = (props: {
/>
-
+
+
+
+ {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ correctTypos: e.target.checked,
+ };
+ });
+ }}
+ />
+
+
+
+ {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ oneTypoWordRangeMin: parseInt(
+ e.currentTarget.value,
+ ),
+ };
+ });
+ }}
+ />
+
+
+
+ {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ oneTypoWordRangeMax:
+ e.currentTarget.value === ""
+ ? null
+ : parseInt(e.currentTarget.value),
+ };
+ });
+ }}
+ />
+
+
+
+ {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ twoTypoWordRangeMin: parseInt(
+ e.currentTarget.value,
+ ),
+ };
+ });
+ }}
+ />
+
+
+
+ {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ twoTypoWordRangeMax:
+ e.currentTarget.value === ""
+ ? null
+ : parseInt(e.currentTarget.value),
+ };
+ });
+ }}
+ />
+
+
+
+ {
+ if (e.currentTarget.value === " ") {
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ disableOnWords: [" "],
+ };
+ });
+ }
+
+ setTempSearchValues((prev) => {
+ return {
+ ...prev,
+ disableOnWords:
+ e.currentTarget.value.split(","),
+ };
+ });
+ }}
+ />
+
{
sort_by: JSON.stringify(state.sort_by),
pageSize: state.pageSize.toString(),
getTotalPages: state.getTotalPages.toString(),
+ correctTypos: state.correctTypos.toString(),
+ oneTypoWordRangeMin: state.oneTypoWordRangeMin.toString(),
+ oneTypoWordRangeMax: state.oneTypoWordRangeMax?.toString() ?? "8",
+ twoTypoWordRangeMin: state.twoTypoWordRangeMin.toString(),
+ twoTypoWordRangeMax: state.twoTypoWordRangeMax?.toString() ?? "",
+ disableOnWords: state.disableOnWords.join(","),
highlightStrategy: state.highlightStrategy,
highlightResults: state.highlightResults.toString(),
highlightThreshold: state.highlightThreshold.toString(),
@@ -121,6 +139,13 @@ const fromStateToParams = (state: SearchOptions): Params => {
};
};
+const parseIntOrNull = (str: string | undefined) => {
+ if (!str || str === "") {
+ return null;
+ }
+ return parseInt(str);
+};
+
const fromParamsToState = (
params: Partial
,
): Omit => {
@@ -136,6 +161,12 @@ const fromParamsToState = (
initalState.sort_by,
pageSize: parseInt(params.pageSize ?? "10"),
getTotalPages: (params.getTotalPages ?? "false") === "true",
+ correctTypos: (params.correctTypos ?? "false") === "true",
+ oneTypoWordRangeMin: parseInt(params.oneTypoWordRangeMin ?? "5"),
+ oneTypoWordRangeMax: parseIntOrNull(params.oneTypoWordRangeMax),
+ twoTypoWordRangeMin: parseInt(params.twoTypoWordRangeMin ?? "8"),
+ twoTypoWordRangeMax: parseIntOrNull(params.twoTypoWordRangeMax),
+ disableOnWords: params.disableOnWords?.split(",") ?? [],
highlightResults: (params.highlightResults ?? "true") === "true",
highlightStrategy: isHighlightStrategy(params.highlightStrategy)
? params.highlightStrategy
diff --git a/helm/local-values.yaml b/helm/local-values.yaml
index 0afdea0ede..ff825a1973 100644
--- a/helm/local-values.yaml
+++ b/helm/local-values.yaml
@@ -22,6 +22,10 @@ containers:
tag: latest
sync_qdrant:
tag: latest
+ bktree_worker:
+ tag: latest
+ word_worker:
+ tag: latest
search:
tag: latest
chat:
diff --git a/helm/templates/bktree-worker-deployment.yaml b/helm/templates/bktree-worker-deployment.yaml
new file mode 100644
index 0000000000..a4c5e8c53c
--- /dev/null
+++ b/helm/templates/bktree-worker-deployment.yaml
@@ -0,0 +1,114 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: bktree-worker
+ labels:
+ app.kubernetes.io/name: bktree-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+spec:
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: bktree-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: bktree-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+ spec:
+ containers:
+ - name: bktree-worker
+ image: {{ printf "%s%s:%s" (ternary "trieve/" "localhost:5001/" (ne .Values.environment "local")) "bktree-worker" .Values.containers.bktree_worker.tag }}
+ env:
+ - name: ADMIN_API_KEY
+ value: {{ .Values.config.trieve.adminApiKey }}
+ - name: BASE_SERVER_URL
+ value: {{ .Values.config.trieve.baseServerUrl }}
+ - name: REDIS_URL
+ value: {{ ( ternary "redis://:redis@trieve-redis-master.default.svc.cluster.local:6379" .Values.config.redis.uri (eq .Values.config.redis.useSubchart true)) }}
+ - name: QDRANT_URL
+ value: {{ ( ternary "http://trieve-qdrant.default.svc.cluster.local:6334" .Values.config.qdrant.qdrantUrl (eq .Values.config.qdrant.useSubchart true)) }}
+ - name: QDRANT_API_KEY
+ value: {{ .Values.config.qdrant.apiKey }}
+ - name: QUANTIZE_VECTORS
+ value: {{ .Values.config.qdrant.quantizeVectors | quote }}
+ - name: REPLICATION_FACTOR
+ value: {{ .Values.config.qdrant.replicationFactor | quote }}
+ - name: DATABASE_URL
+ value: {{ ( ternary "postgres://postgres:password@trieve-postgresql.default.svc.cluster.local:5432/trieve" .Values.postgres.dbURI (eq .Values.postgres.useSubchart true)) }}
+ - name: SMTP_RELAY
+ value: {{ .Values.config.smtp.relay }}
+ - name: SMTP_USERNAME
+ value: {{ .Values.config.smtp.username }}
+ - name: SMTP_PASSWORD
+ value: {{ .Values.config.smtp.password }}
+ - name: SMTP_EMAIL_ADDRESS
+ value: {{ .Values.config.smtp.emailAddress }}
+ - name: OPENAI_API_KEY
+ value: {{ .Values.config.openai.apiKey }}
+ - name: LLM_API_KEY
+ value: {{ .Values.config.llm.apiKey }}
+ - name: SECRET_KEY
+ value: {{ .Values.config.trieve.secretKey | quote }}
+ - name: SALT
+ value: {{ .Values.config.trieve.salt }}
+ - name: S3_ENDPOINT
+ value: {{ .Values.config.s3.endpoint }}
+ - name: S3_ACCESS_KEY
+ value: {{ .Values.config.s3.accessKey }}
+ - name: S3_SECRET_KEY
+ value: {{ .Values.config.s3.secretKey }}
+ - name: S3_BUCKET
+ value: {{ .Values.config.s3.bucket }}
+ - name: COOKIE_SECURE
+ value: {{ .Values.config.trieve.cookieSecure | quote }}
+ - name: QDRANT_COLLECTION
+ value: {{ .Values.config.qdrant.collection }}
+ - name: TIKA_URL
+ value: http://tika.default.svc.cluster.local:9998
+ - name: OPENAI_BASE_URL
+ value: {{ .Values.config.openai.baseUrl }}
+ - name: STRIPE_SECRET
+ value: {{ .Values.config.stripe.secret }}
+ - name: STRIPE_WEBHOOK_SECRET
+ value: {{ .Values.config.stripe.webhookSecret }}
+ - name: ADMIN_DASHBOARD_URL
+ value: {{ .Values.config.trieve.adminDashboardUrl }}
+ - name: OIDC_CLIENT_SECRET
+ value: {{ .Values.config.oidc.clientSecret }}
+ - name: OIDC_CLIENT_ID
+ value: {{ .Values.config.oidc.clientId }}
+ - name: OIDC_AUTH_REDIRECT_URL
+ value: {{ .Values.config.oidc.authRedirectUrl }}
+ - name: OIDC_ISSUER_URL
+ value: {{ .Values.config.oidc.issuerUrl }}
+ - name: GPU_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.gpuServerOrigin }}
+ - name: SPARSE_SERVER_QUERY_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerQueryOrigin }}
+ - name: SPARSE_SERVER_DOC_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerDocOrigin }}
+ - name: SPARSE_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerOrigin }}
+ - name: EMBEDDING_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.embeddingServerOrigin }}
+ - name: EMBEDDING_SERVER_ORIGIN_BGEM3
+ value: {{ .Values.config.trieve.embeddingServerOriginBGEM3 }}
+ - name: RERANKER_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.rerankerServerOrigin }}
+ - name: UNLIMITED
+ value: {{ .Values.config.trieve.unlimited | quote }}
+ - name: REDIS_CONNECTIONS
+ value: "2"
+ - name: AWS_REGION
+ value: {{ .Values.config.s3.region }}
+ - name: CLICKHOUSE_URL
+ value: {{ .Values.config.analytics.clickhouseUrl | quote }}
+ - name: CLICKHOUSE_DB
+ value: {{ .Values.config.analytics.clickhouseDB | quote }}
+ - name: CLICKHOUSE_USER
+ value: {{ .Values.config.analytics.clickhouseUser | quote }}
+ - name: CLICKHOUSE_PASSWORD
+ value: {{ .Values.config.analytics.clickhousePassword | quote }}
+ - name: USE_ANALYTICS
+ value: {{ .Values.config.analytics.enabled | quote }}
diff --git a/helm/templates/wordworker-deployment.yaml b/helm/templates/wordworker-deployment.yaml
new file mode 100644
index 0000000000..563be23c9f
--- /dev/null
+++ b/helm/templates/wordworker-deployment.yaml
@@ -0,0 +1,146 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: word-worker
+ labels:
+ app.kubernetes.io/name: word-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+spec:
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: word-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: word-worker
+ app.kubernetes.io/instance: {{ .Release.Name }}
+ spec:
+ serviceAccountName: cloud-postgres-service-account
+ containers:
+ - name: word-worker
+ image: {{ printf "%s%s:%s" (ternary "trieve/" "localhost:5001/" (ne .Values.environment "local")) "word-worker" .Values.containers.word_worker.tag }}
+ env:
+ - name: ADMIN_API_KEY
+ value: {{ .Values.config.trieve.adminApiKey }}
+ - name: BASE_SERVER_URL
+ value: {{ .Values.config.trieve.baseServerUrl }}
+ - name: REDIS_URL
+ value: {{ ( ternary "redis://:redis@trieve-redis-master.default.svc.cluster.local:6379" .Values.config.redis.uri (eq .Values.config.redis.useSubchart true)) }}
+ - name: QDRANT_URL
+ value: {{ ( ternary "http://trieve-qdrant.default.svc.cluster.local:6334" .Values.config.qdrant.qdrantUrl (eq .Values.config.qdrant.useSubchart true)) }}
+ - name: QDRANT_API_KEY
+ value: {{ .Values.config.qdrant.apiKey }}
+ - name: QUANTIZE_VECTORS
+ value: {{ .Values.config.qdrant.quantizeVectors | quote }}
+ - name: REPLICATION_FACTOR
+ value: {{ .Values.config.qdrant.replicationFactor | quote }}
+ - name: DATABASE_URL
+ value: {{ ( ternary "postgres://postgres:password@trieve-postgresql.default.svc.cluster.local:5432/trieve" .Values.postgres.dbURI (eq .Values.postgres.useSubchart true)) }}
+ - name: SMTP_RELAY
+ value: {{ .Values.config.smtp.relay }}
+ - name: SMTP_USERNAME
+ value: {{ .Values.config.smtp.username }}
+ - name: SMTP_PASSWORD
+ value: {{ .Values.config.smtp.password }}
+ - name: SMTP_EMAIL_ADDRESS
+ value: {{ .Values.config.smtp.emailAddress }}
+ - name: OPENAI_API_KEY
+ value: {{ .Values.config.openai.apiKey }}
+ - name: LLM_API_KEY
+ value: {{ .Values.config.llm.apiKey }}
+ - name: SECRET_KEY
+ value: {{ .Values.config.trieve.secretKey | quote }}
+ - name: SALT
+ value: {{ .Values.config.trieve.salt }}
+ - name: S3_ENDPOINT
+ value: {{ .Values.config.s3.endpoint }}
+ - name: S3_ACCESS_KEY
+ value: {{ .Values.config.s3.accessKey }}
+ - name: S3_SECRET_KEY
+ value: {{ .Values.config.s3.secretKey }}
+ - name: S3_BUCKET
+ value: {{ .Values.config.s3.bucket }}
+ - name: COOKIE_SECURE
+ value: {{ .Values.config.trieve.cookieSecure | quote }}
+ - name: QDRANT_COLLECTION
+ value: {{ .Values.config.qdrant.collection }}
+ - name: TIKA_URL
+ value: http://tika.default.svc.cluster.local:9998
+ - name: OPENAI_BASE_URL
+ value: {{ .Values.config.openai.baseUrl }}
+ - name: STRIPE_SECRET
+ value: {{ .Values.config.stripe.secret }}
+ - name: STRIPE_WEBHOOK_SECRET
+ value: {{ .Values.config.stripe.webhookSecret }}
+ - name: ADMIN_DASHBOARD_URL
+ value: {{ .Values.config.trieve.adminDashboardUrl }}
+ - name: OIDC_CLIENT_SECRET
+ value: {{ .Values.config.oidc.clientSecret }}
+ - name: OIDC_CLIENT_ID
+ value: {{ .Values.config.oidc.clientId }}
+ - name: OIDC_AUTH_REDIRECT_URL
+ value: {{ .Values.config.oidc.authRedirectUrl }}
+ - name: OIDC_ISSUER_URL
+ value: {{ .Values.config.oidc.issuerUrl }}
+ - name: GPU_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.gpuServerOrigin }}
+ - name: SPARSE_SERVER_QUERY_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerQueryOrigin }}
+ - name: SPARSE_SERVER_DOC_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerDocOrigin }}
+ - name: SPARSE_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.sparseServerOrigin }}
+ - name: EMBEDDING_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.embeddingServerOrigin }}
+ - name: EMBEDDING_SERVER_ORIGIN_BGEM3
+ value: {{ .Values.config.trieve.embeddingServerOriginBGEM3 }}
+ - name: RERANKER_SERVER_ORIGIN
+ value: {{ .Values.config.trieve.rerankerServerOrigin }}
+ - name: UNLIMITED
+ value: {{ .Values.config.trieve.unlimited | quote }}
+ - name: REDIS_CONNECTIONS
+ value: "2"
+ - name: AWS_REGION
+ value: {{ .Values.config.s3.region }}
+ - name: CLICKHOUSE_URL
+ value: {{ .Values.config.analytics.clickhouseUrl | quote }}
+ - name: CLICKHOUSE_DB
+ value: {{ .Values.config.analytics.clickhouseDB | quote }}
+ - name: CLICKHOUSE_USER
+ value: {{ .Values.config.analytics.clickhouseUser | quote }}
+ - name: CLICKHOUSE_PASSWORD
+ value: {{ .Values.config.analytics.clickhousePassword | quote }}
+ - name: USE_ANALYTICS
+ value: {{ .Values.config.analytics.enabled | quote }}
+ {{- if eq $.Values.environment "gcloud" }}
+ - name: cloud-sql-proxy
+ # It is recommended to use the latest version of the Cloud SQL Auth Proxy
+ # Make sure to update on a regular schedule!
+ image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0
+ args:
+ - "--structured-logs"
+ - "--auto-iam-authn"
+ # Replace DB_PORT with the port the proxy should listen on
+ - "--port=5432"
+ - "studious-lore-405302:us-west1:trieve-cloud"
+ securityContext:
+ # The default Cloud SQL Auth Proxy image runs as the
+ # "nonroot" user and group (uid: 65532) by default.
+ runAsNonRoot: true
+ # You should use resource requests/limits as a best practice to prevent
+ # pods from consuming too many resources and affecting the execution of
+ # other pods. You should adjust the following values based on what your
+ # application needs. For details, see
+ # https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+ resources:
+ requests:
+ # The proxy's memory use scales linearly with the number of active
+ # connections. Fewer open connections will use less memory. Adjust
+ # this value based on your application's requirements.
+ memory: "2Gi"
+ # The proxy's CPU use scales linearly with the amount of IO between
+ # the database and the application. Adjust this value based on your
+ # application's requirements.
+ cpu: "1"
+ {{- end }}
diff --git a/helm/values.yaml.tpl b/helm/values.yaml.tpl
index c4e08e1cce..233c1764e7 100644
--- a/helm/values.yaml.tpl
+++ b/helm/values.yaml.tpl
@@ -23,6 +23,10 @@ containers:
tag: latest
sync_qdrant:
tag: latest
+ bktree_worker:
+ tag: latest
+ word_worker:
+ tag: latest
search:
tag: latest
chat:
diff --git a/scripts/reset-bktree.sh b/scripts/reset-bktree.sh
new file mode 100644
index 0000000000..96fd743900
--- /dev/null
+++ b/scripts/reset-bktree.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Default connection details
+REDIS_HOST="localhost"
+REDIS_PORT="6379"
+REDIS_PASSWORD=""
+CLICKHOUSE_HOST="localhost"
+CLICKHOUSE_PORT="8123"
+CLICKHOUSE_USER="default"
+CLICKHOUSE_PASSWORD="password"
+CLICKHOUSE_DB="default"
+
+# Function to print usage
+usage() {
+ echo "Usage: $0 -d [-rh ] [-rp ] [-rw ] [-ch ] [-cp ] [-cu ] [-cw ] [-cd ]"
+ exit 1
+}
+
+# Parse command line arguments (getopts cannot handle multi-character flags like -rh, so parse manually)
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -d) DATASET_ID="$2"; shift 2 ;;
+ -rh) REDIS_HOST="$2"; shift 2 ;;
+ -rp) REDIS_PORT="$2"; shift 2 ;;
+ -rw) REDIS_PASSWORD="$2"; shift 2 ;;
+ -ch) CLICKHOUSE_HOST="$2"; shift 2 ;;
+ -cp) CLICKHOUSE_PORT="$2"; shift 2 ;;
+ -cu) CLICKHOUSE_USER="$2"; shift 2 ;;
+ -cw) CLICKHOUSE_PASSWORD="$2"; shift 2 ;;
+ -cd) CLICKHOUSE_DB="$2"; shift 2 ;;
+ *) usage ;;
+ esac
+done
+
+# Check if dataset_id is provided
+if [ -z "$DATASET_ID" ]; then
+ echo "Error: dataset_id is required"
+ usage
+fi
+
+# Construct Redis CLI command
+REDIS_CMD="redis-cli -h $REDIS_HOST -p $REDIS_PORT"
+if [ -n "$REDIS_PASSWORD" ]; then
+ REDIS_CMD="$REDIS_CMD -a $REDIS_PASSWORD"
+fi
+
+# Delete key from Redis
+echo "Deleting key *$DATASET_ID from Redis..."
+$REDIS_CMD DEL "*$DATASET_ID"
+
+# Delete row from ClickHouse
+echo "Deleting row with dataset_id=$DATASET_ID from ClickHouse..."
+clickhouse-client \
+ --host "$CLICKHOUSE_HOST" \
+ --port "$CLICKHOUSE_PORT" \
+ --user "$CLICKHOUSE_USER" \
+ --password "$CLICKHOUSE_PASSWORD" \
+ --database "$CLICKHOUSE_DB" \
+ --query "ALTER TABLE dataset_words_last_processed DELETE WHERE dataset_id = '$DATASET_ID'"
+
+echo "Cleanup completed for dataset_id: $DATASET_ID"
\ No newline at end of file
diff --git a/server/Cargo.lock b/server/Cargo.lock
index a2fb633736..1a61a1c36f 100644
--- a/server/Cargo.lock
+++ b/server/Cargo.lock
@@ -722,6 +722,15 @@ dependencies = [
"redis 0.25.4",
]
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -743,6 +752,15 @@ dependencies = [
"crunchy",
]
+[[package]]
+name = "bktree"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bb1e744816f6a3b9e962186091867f3e5959d4dac995777ec254631cb00b21c"
+dependencies = [
+ "num",
+]
+
[[package]]
name = "blake2b_simd"
version = "1.0.2"
@@ -1149,6 +1167,19 @@ dependencies = [
"cfg-if",
]
+[[package]]
+name = "crossbeam"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8"
+dependencies = [
+ "crossbeam-channel",
+ "crossbeam-deque",
+ "crossbeam-epoch",
+ "crossbeam-queue",
+ "crossbeam-utils",
+]
+
[[package]]
name = "crossbeam-channel"
version = "0.5.13"
@@ -1177,6 +1208,15 @@ dependencies = [
"crossbeam-utils",
]
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35"
+dependencies = [
+ "crossbeam-utils",
+]
+
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
@@ -1737,9 +1777,9 @@ dependencies = [
[[package]]
name = "flate2"
-version = "1.0.30"
+version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
+checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920"
dependencies = [
"crc32fast",
"miniz_oxide",
@@ -2895,6 +2935,30 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
@@ -2947,6 +3011,17 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -5678,11 +5753,14 @@ dependencies = [
"async-stripe",
"base64 0.22.1",
"bb8-redis",
+ "bincode",
+ "bktree",
"blake3",
"cfg-if",
"chm",
"chrono",
"clickhouse 0.12.0",
+ "crossbeam",
"crossbeam-channel",
"dateparser",
"derive_more",
@@ -5690,12 +5768,14 @@ dependencies = [
"diesel-async",
"diesel_migrations",
"dotenvy",
+ "flate2",
"futures",
"futures-util",
"glob",
"itertools 0.13.0",
"lazy_static",
"lettre",
+ "levenshtein_automata",
"log",
"murmur3",
"ndarray",
@@ -5708,6 +5788,7 @@ dependencies = [
"prometheus",
"qdrant-client",
"rand 0.8.5",
+ "rayon",
"redis 0.25.4",
"regex",
"regex-split",
@@ -5723,6 +5804,7 @@ dependencies = [
"signal-hook",
"simple-server-timing-header",
"simsearch",
+ "strsim 0.11.1",
"tantivy",
"time",
"tokio",
diff --git a/server/Cargo.toml b/server/Cargo.toml
index ed093a13b6..6bab8d5790 100644
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@@ -17,10 +17,22 @@ panic = "abort"
name = "trieve-server"
path = "src/main.rs"
+[[bin]]
+name = "word-id-cronjob"
+path = "src/bin/word-id-cronjob.rs"
+
[[bin]]
name = "ingestion-worker"
path = "src/bin/ingestion-worker.rs"
+[[bin]]
+name = "bktree-worker"
+path = "src/bin/bktree-worker.rs"
+
+[[bin]]
+name = "word-worker"
+path = "src/bin/word-worker.rs"
+
[[bin]]
name = "file-worker"
path = "src/bin/file-worker.rs"
@@ -106,7 +118,7 @@ reqwest = { version = "0.12.2", features = ["json"] }
rand = "0.8.5"
dotenvy = "0.15.7"
simsearch = "0.2.4"
-lazy_static = { version = "1.4.0" }
+lazy_static = "1.4.0"
actix-files = "0.6.2"
utoipa = { version = "4.2", features = [
"actix_extras",
@@ -148,6 +160,14 @@ prometheus = "0.13.4"
chm = "0.1.16"
murmur3 = "0.5.2"
tantivy = "0.22.0"
+strsim = "0.11.1"
+levenshtein_automata = "0.2.1"
+bktree = "1.0.1"
+flate2 = "1.0.31"
+bincode = "1.3"
+rayon = "1.10.0"
+crossbeam = "0.8.4"
+
[build-dependencies]
dotenvy = "0.15.7"
diff --git a/server/Dockerfile.bktree-worker b/server/Dockerfile.bktree-worker
new file mode 100644
index 0000000000..007be040f5
--- /dev/null
+++ b/server/Dockerfile.bktree-worker
@@ -0,0 +1,28 @@
+FROM rust:1.80-slim-bookworm AS chef
+# We only pay the installation cost once,
+# it will be cached from the second build onwards
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev g++ curl
+RUN cargo install cargo-chef
+WORKDIR app
+
+FROM chef AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+COPY --from=planner /app/recipe.json recipe.json
+# Build dependencies - this is the caching Docker layer!
+RUN cargo chef cook --release --recipe-path recipe.json --bin "bktree-worker"
+# Build application
+COPY . .
+RUN cargo build --release --features "runtime-env" --bin "bktree-worker"
+
+FROM debian:bookworm-slim as runtime
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev ca-certificates
+WORKDIR /app
+COPY ./migrations/ /app/migrations
+COPY --from=builder /app/target/release/bktree-worker /app/bktree-worker
+
+
+EXPOSE 8090
+ENTRYPOINT ["/app/bktree-worker"]
diff --git a/server/Dockerfile.word-id-cronjob b/server/Dockerfile.word-id-cronjob
new file mode 100644
index 0000000000..6cc82d7137
--- /dev/null
+++ b/server/Dockerfile.word-id-cronjob
@@ -0,0 +1,28 @@
+FROM rust:1.80-slim-bookworm AS chef
+# We only pay the installation cost once,
+# it will be cached from the second build onwards
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev g++ curl
+RUN cargo install cargo-chef
+WORKDIR app
+
+FROM chef AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+COPY --from=planner /app/recipe.json recipe.json
+# Build dependencies - this is the caching Docker layer!
+RUN cargo chef cook --release --recipe-path recipe.json --bin "word-id-cronjob"
+# Build application
+COPY . .
+RUN cargo build --release --features "runtime-env" --bin "word-id-cronjob"
+
+FROM debian:bookworm-slim as runtime
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev ca-certificates
+WORKDIR /app
+COPY ./migrations/ /app/migrations
+COPY --from=builder /app/target/release/word-id-cronjob /app/word-id-cronjob
+
+
+EXPOSE 8090
+ENTRYPOINT ["/app/word-id-cronjob"]
diff --git a/server/Dockerfile.word-worker b/server/Dockerfile.word-worker
new file mode 100644
index 0000000000..454f1d1676
--- /dev/null
+++ b/server/Dockerfile.word-worker
@@ -0,0 +1,28 @@
+FROM rust:1.80-slim-bookworm AS chef
+# We only pay the installation cost once,
+# it will be cached from the second build onwards
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev g++ curl
+RUN cargo install cargo-chef
+WORKDIR app
+
+FROM chef AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+COPY --from=planner /app/recipe.json recipe.json
+# Build dependencies - this is the caching Docker layer!
+RUN cargo chef cook --release --recipe-path recipe.json --bin "word-worker"
+# Build application
+COPY . .
+RUN cargo build --release --features "runtime-env" --bin "word-worker"
+
+FROM debian:bookworm-slim as runtime
+RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev ca-certificates
+WORKDIR /app
+COPY ./migrations/ /app/migrations
+COPY --from=builder /app/target/release/word-worker /app/word-worker
+
+
+EXPOSE 8090
+ENTRYPOINT ["/app/word-worker"]
diff --git a/server/ch_migrations/1723258343_store_words_in_clickhouse/down.sql b/server/ch_migrations/1723258343_store_words_in_clickhouse/down.sql
new file mode 100644
index 0000000000..8eb04f3b52
--- /dev/null
+++ b/server/ch_migrations/1723258343_store_words_in_clickhouse/down.sql
@@ -0,0 +1,2 @@
+
+DROP TABLE IF EXISTS words_datasets;
diff --git a/server/ch_migrations/1723258343_store_words_in_clickhouse/up.sql b/server/ch_migrations/1723258343_store_words_in_clickhouse/up.sql
new file mode 100644
index 0000000000..28ac871785
--- /dev/null
+++ b/server/ch_migrations/1723258343_store_words_in_clickhouse/up.sql
@@ -0,0 +1,12 @@
+CREATE TABLE IF NOT EXISTS words_datasets (
+ id UUID NOT NULL,
+ dataset_id UUID NOT NULL,
+ word String NOT NULL,
+ count Int32 NOT NULL,
+ created_at DateTime DEFAULT now() NOT NULL,
+ INDEX idx_created_at created_at TYPE minmax GRANULARITY 8192,
+ INDEX idx_id id TYPE minmax GRANULARITY 8192
+) ENGINE = SummingMergeTree(created_at)
+ORDER BY (dataset_id, word)
+PARTITION BY dataset_id;
+
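For context on the table above: a SummingMergeTree collapses rows that share the ORDER BY key, here `(dataset_id, word)`, by summing the designated numeric columns at merge time, so repeated inserts of the same word accumulate into one running count per dataset. A std-only Rust sketch of that merge behavior (a `u32` stands in for the dataset UUID; this is an illustration of the engine's semantics, not ClickHouse's actual merge code):

```rust
use std::collections::HashMap;

// Each row is ((dataset_id, word), count); merging sums counts for rows that
// share the sorting key, mirroring how words_datasets accumulates per-dataset
// word frequencies across many small async inserts.
fn merge_rows(rows: Vec<((u32, String), i32)>) -> HashMap<(u32, String), i32> {
    let mut merged: HashMap<(u32, String), i32> = HashMap::new();
    for (key, count) in rows {
        *merged.entry(key).or_insert(0) += count;
    }
    merged
}

fn main() {
    let rows = vec![
        ((1, "search".to_string()), 2),
        ((1, "search".to_string()), 3),
        ((1, "typo".to_string()), 1),
    ];
    println!("{:?}", merge_rows(rows)); // search -> 5, typo -> 1
}
```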
diff --git a/server/ch_migrations/1723490007_create_last_processed_table/down.sql b/server/ch_migrations/1723490007_create_last_processed_table/down.sql
new file mode 100644
index 0000000000..484495f3ac
--- /dev/null
+++ b/server/ch_migrations/1723490007_create_last_processed_table/down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS dataset_words_last_processed;
diff --git a/server/ch_migrations/1723490007_create_last_processed_table/up.sql b/server/ch_migrations/1723490007_create_last_processed_table/up.sql
new file mode 100644
index 0000000000..e2b23894dd
--- /dev/null
+++ b/server/ch_migrations/1723490007_create_last_processed_table/up.sql
@@ -0,0 +1,6 @@
+CREATE TABLE IF NOT EXISTS dataset_words_last_processed (
+ last_processed DateTime DEFAULT now() NOT NULL,
+ dataset_id UUID NOT NULL,
+) ENGINE = ReplacingMergeTree(last_processed)
+ORDER BY (dataset_id)
+PARTITION BY dataset_id;
diff --git a/server/migrations/2024-08-02-200541_word_dataset_table/down.sql b/server/migrations/2024-08-02-200541_word_dataset_table/down.sql
new file mode 100644
index 0000000000..a8ab58383e
--- /dev/null
+++ b/server/migrations/2024-08-02-200541_word_dataset_table/down.sql
@@ -0,0 +1,7 @@
+-- This file should undo anything in `up.sql`
+
+DROP TABLE IF EXISTS "words_datasets";
+DROP TABLE IF EXISTS "words_in_datasets";
+DROP TABLE IF EXISTS "dataset_words_last_processed";
+
+
diff --git a/server/migrations/2024-08-02-200541_word_dataset_table/up.sql b/server/migrations/2024-08-02-200541_word_dataset_table/up.sql
new file mode 100644
index 0000000000..ab43d0ae6f
--- /dev/null
+++ b/server/migrations/2024-08-02-200541_word_dataset_table/up.sql
@@ -0,0 +1,25 @@
+-- Your SQL goes here
+
+CREATE TABLE IF NOT EXISTS "words_in_datasets" (
+ id UUID PRIMARY KEY,
+ word TEXT NOT NULL,
+ UNIQUE(word)
+);
+
+CREATE TABLE IF NOT EXISTS "words_datasets" (
+ id UUID PRIMARY KEY,
+ dataset_id UUID NOT NULL,
+ word_id UUID NOT NULL,
+ count INT NOT NULL,
+ UNIQUE(dataset_id, word_id),
+ FOREIGN KEY (dataset_id) REFERENCES "datasets"(id) ON DELETE CASCADE,
+ FOREIGN KEY (word_id) REFERENCES "words_in_datasets"(id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS "dataset_words_last_processed" (
+ id UUID PRIMARY KEY,
+ last_processed TIMESTAMP NULL,
+ dataset_id UUID NOT NULL,
+ FOREIGN KEY (dataset_id) REFERENCES "datasets"(id) ON DELETE CASCADE,
+ UNIQUE(dataset_id)
+);
diff --git a/server/migrations/2024-08-09-013645_update_dataset_updated_at/down.sql b/server/migrations/2024-08-09-013645_update_dataset_updated_at/down.sql
new file mode 100644
index 0000000000..2b44495d86
--- /dev/null
+++ b/server/migrations/2024-08-09-013645_update_dataset_updated_at/down.sql
@@ -0,0 +1,4 @@
+-- This file should undo anything in `up.sql`
+-- Finally, let's drop the trigger and function
+DROP TRIGGER IF EXISTS trigger_update_dataset_timestamp ON chunk_metadata;
+DROP FUNCTION IF EXISTS update_dataset_timestamp();
\ No newline at end of file
diff --git a/server/migrations/2024-08-09-013645_update_dataset_updated_at/up.sql b/server/migrations/2024-08-09-013645_update_dataset_updated_at/up.sql
new file mode 100644
index 0000000000..c856109716
--- /dev/null
+++ b/server/migrations/2024-08-09-013645_update_dataset_updated_at/up.sql
@@ -0,0 +1,39 @@
+CREATE OR REPLACE FUNCTION update_chunk_metadata_counts()
+RETURNS TRIGGER AS $$
+DECLARE
+ d_id UUID;
+ new_count INT;
+BEGIN
+ SELECT dataset_id INTO d_id FROM modified WHERE dataset_id IS NOT NULL LIMIT 1;
+ IF d_id IS NULL THEN
+ RETURN NULL;
+ END IF;
+ SELECT COUNT(modified.id) INTO new_count FROM modified;
+
+ IF TG_OP = 'INSERT' THEN
+ -- Update dataset_usage_counts
+ INSERT INTO dataset_usage_counts (dataset_id, chunk_count)
+ VALUES (d_id, new_count)
+ ON CONFLICT (dataset_id) DO UPDATE
+ SET chunk_count = dataset_usage_counts.chunk_count + new_count;
+
+ -- Update dataset
+ UPDATE datasets
+ SET updated_at = CURRENT_TIMESTAMP
+ WHERE id = d_id;
+
+ ELSIF TG_OP = 'DELETE' THEN
+ -- Update dataset_usage_counts
+ UPDATE dataset_usage_counts
+ SET chunk_count = dataset_usage_counts.chunk_count - new_count
+ WHERE dataset_id = d_id;
+
+ -- Update dataset
+ UPDATE datasets
+ SET updated_at = CURRENT_TIMESTAMP
+ WHERE id = d_id;
+ END IF;
+
+ RETURN NULL;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/server/migrations/2024-08-09-024547_add_created_at_to_word/down.sql b/server/migrations/2024-08-09-024547_add_created_at_to_word/down.sql
new file mode 100644
index 0000000000..330b2abf25
--- /dev/null
+++ b/server/migrations/2024-08-09-024547_add_created_at_to_word/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+ALTER TABLE words_datasets DROP COLUMN IF EXISTS created_at;
\ No newline at end of file
diff --git a/server/migrations/2024-08-09-024547_add_created_at_to_word/up.sql b/server/migrations/2024-08-09-024547_add_created_at_to_word/up.sql
new file mode 100644
index 0000000000..1ca1af8298
--- /dev/null
+++ b/server/migrations/2024-08-09-024547_add_created_at_to_word/up.sql
@@ -0,0 +1,3 @@
+-- Your SQL goes here
+ALTER TABLE words_datasets
+ADD COLUMN IF NOT EXISTS created_at TIMESTAMP NOT NULL DEFAULT NOW();
\ No newline at end of file
diff --git a/server/migrations/2024-08-10-032512_delete_tables/down.sql b/server/migrations/2024-08-10-032512_delete_tables/down.sql
new file mode 100644
index 0000000000..8526f01622
--- /dev/null
+++ b/server/migrations/2024-08-10-032512_delete_tables/down.sql
@@ -0,0 +1,16 @@
+-- This file should undo anything in `up.sql`
+CREATE TABLE IF NOT EXISTS "words_in_datasets" (
+ id UUID PRIMARY KEY,
+ word TEXT NOT NULL,
+ UNIQUE(word)
+);
+
+CREATE TABLE IF NOT EXISTS "words_datasets" (
+ id UUID PRIMARY KEY,
+ dataset_id UUID NOT NULL,
+ word_id UUID NOT NULL,
+ count INT NOT NULL,
+ UNIQUE(dataset_id, word_id),
+ FOREIGN KEY (dataset_id) REFERENCES "datasets"(id) ON DELETE CASCADE,
+ FOREIGN KEY (word_id) REFERENCES "words_in_datasets"(id) ON DELETE CASCADE
+);
diff --git a/server/migrations/2024-08-10-032512_delete_tables/up.sql b/server/migrations/2024-08-10-032512_delete_tables/up.sql
new file mode 100644
index 0000000000..591cd969c6
--- /dev/null
+++ b/server/migrations/2024-08-10-032512_delete_tables/up.sql
@@ -0,0 +1,5 @@
+-- Your SQL goes here
+DROP TABLE IF EXISTS "words_datasets";
+DROP TABLE IF EXISTS "words_in_datasets";
+
+
diff --git a/server/migrations/2024-08-12-191216_delete_last_processed_table/down.sql b/server/migrations/2024-08-12-191216_delete_last_processed_table/down.sql
new file mode 100644
index 0000000000..a8589faf50
--- /dev/null
+++ b/server/migrations/2024-08-12-191216_delete_last_processed_table/down.sql
@@ -0,0 +1,8 @@
+-- This file should undo anything in `up.sql`
+CREATE TABLE IF NOT EXISTS "dataset_words_last_processed" (
+ id UUID PRIMARY KEY,
+ last_processed TIMESTAMP NULL,
+ dataset_id UUID NOT NULL,
+ FOREIGN KEY (dataset_id) REFERENCES "datasets"(id) ON DELETE CASCADE,
+ UNIQUE(dataset_id)
+);
diff --git a/server/migrations/2024-08-12-191216_delete_last_processed_table/up.sql b/server/migrations/2024-08-12-191216_delete_last_processed_table/up.sql
new file mode 100644
index 0000000000..6eab645e73
--- /dev/null
+++ b/server/migrations/2024-08-12-191216_delete_last_processed_table/up.sql
@@ -0,0 +1,2 @@
+-- Your SQL goes here
+DROP TABLE IF EXISTS "dataset_words_last_processed";
\ No newline at end of file
diff --git a/server/src/bin/bktree-worker.rs b/server/src/bin/bktree-worker.rs
new file mode 100644
index 0000000000..122fb6def7
--- /dev/null
+++ b/server/src/bin/bktree-worker.rs
@@ -0,0 +1,387 @@
+use std::sync::{
+ atomic::{AtomicBool, Ordering},
+ Arc,
+};
+
+use chm::tools::migrations::SetupArgs;
+use rand::Rng;
+use sentry::{Hub, SentryFutureExt};
+use signal_hook::consts::SIGTERM;
+use tracing_subscriber::{prelude::*, EnvFilter, Layer};
+use trieve_server::{
+ data::models::RedisPool,
+ errors::ServiceError,
+ get_env,
+ operators::{
+ chunk_operator::get_last_processed_from_clickhouse,
+ dataset_operator::{scroll_words_from_dataset, update_dataset_last_processed_query},
+ words_operator::{BkTree, CreateBkTreeMessage},
+ },
+};
+
+#[allow(clippy::print_stdout)]
+fn main() {
+ dotenvy::dotenv().ok();
+ let sentry_url = std::env::var("SENTRY_URL");
+ let _guard = if let Ok(sentry_url) = sentry_url {
+ let guard = sentry::init((
+ sentry_url,
+ sentry::ClientOptions {
+ release: sentry::release_name!(),
+ traces_sample_rate: 1.0,
+ ..Default::default()
+ },
+ ));
+
+ tracing_subscriber::Registry::default()
+ .with(sentry::integrations::tracing::layer())
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ log::info!("Sentry monitoring enabled");
+ Some(guard)
+ } else {
+ tracing_subscriber::Registry::default()
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ None
+ };
+
+ let should_terminate = Arc::new(AtomicBool::new(false));
+ signal_hook::flag::register(SIGTERM, Arc::clone(&should_terminate))
+ .expect("Failed to register shutdown hook");
+
+ tokio::runtime::Builder::new_current_thread()
+ .enable_all()
+ .build()
+ .expect("Failed to create tokio runtime")
+ .block_on(
+ async move {
+ let redis_url = get_env!("REDIS_URL", "REDIS_URL is not set");
+ let redis_connections: u32 = std::env::var("REDIS_CONNECTIONS")
+ .unwrap_or("2".to_string())
+ .parse()
+ .unwrap_or(2);
+
+ let redis_manager = bb8_redis::RedisConnectionManager::new(redis_url)
+ .expect("Failed to connect to redis");
+
+ let redis_pool = bb8_redis::bb8::Pool::builder()
+ .max_size(redis_connections)
+ .connection_timeout(std::time::Duration::from_secs(2))
+ .build(redis_manager)
+ .await
+ .expect("Failed to create redis pool");
+
+ let web_redis_pool = actix_web::web::Data::new(redis_pool);
+
+ let args = SetupArgs {
+ url: Some(get_env!("CLICKHOUSE_URL", "CLICKHOUSE_URL is not set").to_string()),
+ user: Some(
+ get_env!("CLICKHOUSE_USER", "CLICKHOUSE_USER is not set").to_string(),
+ ),
+ password: Some(
+ get_env!("CLICKHOUSE_PASSWORD", "CLICKHOUSE_PASSWORD is not set")
+ .to_string(),
+ ),
+ database: Some(
+ get_env!("CLICKHOUSE_DB", "CLICKHOUSE_DB is not set").to_string(),
+ ),
+ };
+
+ let clickhouse_client = clickhouse::Client::default()
+ .with_url(args.url.as_ref().unwrap())
+ .with_user(args.user.as_ref().unwrap())
+ .with_password(args.password.as_ref().unwrap())
+ .with_database(args.database.as_ref().unwrap())
+ .with_option("async_insert", "1")
+ .with_option("wait_for_async_insert", "0");
+
+ let should_terminate = Arc::new(AtomicBool::new(false));
+ signal_hook::flag::register(SIGTERM, Arc::clone(&should_terminate))
+ .expect("Failed to register shutdown hook");
+
+ bktree_worker(should_terminate, web_redis_pool, clickhouse_client).await
+ }
+ .bind_hub(Hub::new_from_top(Hub::current())),
+ );
+}
+
+#[allow(clippy::print_stdout)]
+async fn bktree_worker(
+ should_terminate: Arc<AtomicBool>,
+ redis_pool: actix_web::web::Data<RedisPool>,
+ clickhouse_client: clickhouse::Client,
+) {
+ log::info!("Starting bk tree service thread");
+
+ let mut redis_conn_sleep = std::time::Duration::from_secs(1);
+
+ #[allow(unused_assignments)]
+ let mut opt_redis_connection = None;
+
+ loop {
+ let borrowed_redis_connection = match redis_pool.get().await {
+ Ok(redis_connection) => Some(redis_connection),
+ Err(err) => {
+ log::error!("Failed to get redis connection outside of loop: {:?}", err);
+ None
+ }
+ };
+
+ if borrowed_redis_connection.is_some() {
+ opt_redis_connection = borrowed_redis_connection;
+ break;
+ }
+
+ tokio::time::sleep(redis_conn_sleep).await;
+ redis_conn_sleep = std::cmp::min(redis_conn_sleep * 2, std::time::Duration::from_secs(300));
+ }
+
+ let mut redis_connection =
+ opt_redis_connection.expect("Failed to get redis connection outside of loop");
+
+ let mut broken_pipe_sleep = std::time::Duration::from_secs(10);
+
+ loop {
+ if should_terminate.load(Ordering::Relaxed) {
+ log::info!("Shutting down");
+ break;
+ }
+
+ let payload_result: Result<Vec<String>, redis::RedisError> = redis::cmd("SPOP")
+ .arg("bktree_creation")
+ .query_async(&mut *redis_connection)
+ .await;
+
+ let serialized_message = match payload_result {
+ Ok(payload) => {
+ broken_pipe_sleep = std::time::Duration::from_secs(10);
+
+ if payload.is_empty() {
+ continue;
+ }
+ let _: Result = redis::cmd("SADD")
+ .arg("bktree_processing")
+ .query_async(&mut *redis_connection)
+ .await;
+
+ payload
+ .first()
+ .expect("Payload must have a first element")
+ .clone()
+ }
+ Err(err) => {
+ log::error!("Unable to process {:?}", err);
+
+ if err.is_io_error() {
+ tokio::time::sleep(broken_pipe_sleep).await;
+ broken_pipe_sleep =
+ std::cmp::min(broken_pipe_sleep * 2, std::time::Duration::from_secs(300));
+ }
+
+ continue;
+ }
+ };
+
+ let create_tree_msg: CreateBkTreeMessage = match serde_json::from_str(&serialized_message) {
+ Ok(message) => message,
+ Err(err) => {
+ log::error!(
+ "Failed to deserialize message, was not a CreateBkTreeMessage: {:?}",
+ err
+ );
+ continue;
+ }
+ };
+
+ let mut id_offset = uuid::Uuid::nil();
+ log::info!("Processing dataset {}", create_tree_msg.dataset_id);
+
+ let mut bk_tree = if let Ok(Some(bktree)) =
+ BkTree::from_redis(create_tree_msg.dataset_id, redis_pool.clone()).await
+ {
+ bktree
+ } else {
+ BkTree::new()
+ };
+
+ let mut failed = false;
+
+ let last_processed =
+ get_last_processed_from_clickhouse(&clickhouse_client, create_tree_msg.dataset_id)
+ .await;
+
+ let last_processed = match last_processed {
+ Ok(last_processed) => last_processed.map(|lp| lp.last_processed),
+ Err(err) => {
+ let _ = readd_error_to_queue(create_tree_msg.clone(), &err, redis_pool.clone())
+ .await
+ .map_err(|e| {
+ eprintln!("Failed to readd error to queue: {:?}", e);
+ });
+ continue;
+ }
+ };
+
+ while let Ok(Some(word_and_counts)) = scroll_words_from_dataset(
+ create_tree_msg.dataset_id,
+ id_offset,
+ last_processed,
+ 5000,
+ &clickhouse_client,
+ )
+ .await
+ .map_err(|err| {
+ let err = err.clone();
+ let redis_pool = redis_pool.clone();
+ let create_tree_msg = create_tree_msg.clone();
+ tokio::spawn(async move {
+ let _ = readd_error_to_queue(create_tree_msg.clone(), &err, redis_pool.clone())
+ .await
+ .map_err(|e| {
+ eprintln!("Failed to readd error to queue: {:?}", e);
+ });
+ });
+ failed = true;
+ }) {
+ dbg!(id_offset);
+ if let Some(last_word) = word_and_counts.last() {
+ id_offset = last_word.id;
+ }
+
+ let word_and_counts = word_and_counts
+ .into_iter()
+ .map(|words| (words.word, words.count))
+ .collect::<Vec<_>>();
+
+ bk_tree.insert_all(word_and_counts);
+ }
+
+ if failed {
+ continue;
+ }
+
+ match bk_tree
+ .save(create_tree_msg.dataset_id, redis_pool.clone())
+ .await
+ {
+ Ok(()) => {
+ let _ = redis::cmd("LREM")
+ .arg("bktree_processing")
+ .arg(1)
+ .arg(serialized_message.clone())
+ .query_async::(&mut *redis_connection)
+ .await;
+
+ log::info!(
+ "Succesfully created bk-tree for {}",
+ create_tree_msg.dataset_id
+ );
+ }
+ Err(err) => {
+ let _ = readd_error_to_queue(
+ create_tree_msg.clone(),
+ &ServiceError::InternalServerError(format!(
+ "Failed to serialize tree: {:?}",
+ err
+ )),
+ redis_pool.clone(),
+ )
+ .await;
+ }
+ }
+
+ match update_dataset_last_processed_query(create_tree_msg.dataset_id, &clickhouse_client)
+ .await
+ {
+ Ok(_) => {}
+ Err(err) => {
+ log::error!("Failed to update last processed {:?}", err);
+ }
+ }
+ let sleep_duration = rand::thread_rng().gen_range(1..=10);
+ tokio::time::sleep(std::time::Duration::from_secs(sleep_duration)).await;
+ }
+}
+
+pub async fn readd_error_to_queue(
+ message: CreateBkTreeMessage,
+ error: &ServiceError,
+ redis_pool: actix_web::web::Data<RedisPool>,
+) -> Result<(), ServiceError> {
+ let mut message = message;
+
+ let old_payload_message = serde_json::to_string(&message).map_err(|_| {
+ ServiceError::InternalServerError("Failed to reserialize input for retry".to_string())
+ })?;
+
+ let mut redis_conn = redis_pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ let _ = redis::cmd("SREM")
+ .arg("bktree_processing")
+ .arg(1)
+ .arg(old_payload_message.clone())
+ .query_async::(&mut *redis_conn)
+ .await;
+
+ message.attempt_number += 1;
+
+ if message.attempt_number == 3 {
+ log::error!("Failed to construct bktree 3 times {:?}", error);
+ let mut redis_conn = redis_pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ redis::cmd("SADD")
+ .arg("bktree_dead_letters")
+ .arg(old_payload_message)
+ .query_async(&mut *redis_conn)
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ return Err(ServiceError::InternalServerError(format!(
+ "Failed to construct bktree {:?}",
+ error
+ )));
+ } else {
+ let new_payload_message = serde_json::to_string(&message).map_err(|_| {
+ ServiceError::InternalServerError("Failed to reserialize input for retry".to_string())
+ })?;
+
+ let mut redis_conn = redis_pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ log::error!(
+ "Failed to insert data, re-adding {:?} retry: {:?}",
+ error,
+ message.attempt_number
+ );
+
+ redis::cmd("SADD")
+ .arg("bktree_creation")
+ .arg(&new_payload_message)
+ .query_async(&mut *redis_conn)
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?
+ }
+
+ Ok(())
+}
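The loop above pops `CreateBkTreeMessage`s off Redis, scrolls word counts for the dataset, and folds them into a BK-tree that is saved back to Redis. The repo's `BkTree` comes from `words_operator` (backed by the `bktree` crate added in Cargo.toml); the following is only a self-contained, std-only sketch of the data structure and the triangle-inequality pruning that makes fuzzy word lookups cheap.

```rust
use std::collections::{hash_map::Entry, HashMap};

/// Classic two-row Levenshtein distance.
fn levenshtein(a: &str, b: &str) -> usize {
    let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let cost = if ca == cb { 0 } else { 1 };
            cur.push((prev[j + 1] + 1).min(cur[j] + 1).min(prev[j] + cost));
        }
        prev = cur;
    }
    prev[b.len()]
}

/// One BK-tree node: children are keyed by their edit distance to this word.
struct Node {
    word: String,
    children: HashMap<usize, Node>,
}

impl Node {
    fn new(word: &str) -> Self {
        Node { word: word.to_string(), children: HashMap::new() }
    }

    fn insert(&mut self, word: &str) {
        let d = levenshtein(word, &self.word);
        if d == 0 {
            return; // already present
        }
        match self.children.entry(d) {
            Entry::Occupied(mut child) => child.get_mut().insert(word),
            Entry::Vacant(slot) => {
                slot.insert(Node::new(word));
            }
        }
    }

    /// Collect every stored word within `tolerance` edits of `query`. The
    /// triangle inequality lets us skip children whose distance key falls
    /// outside [d - tolerance, d + tolerance].
    fn search<'a>(&'a self, query: &str, tolerance: usize, out: &mut Vec<&'a str>) {
        let d = levenshtein(query, &self.word);
        if d <= tolerance {
            out.push(&self.word);
        }
        for (dist, child) in &self.children {
            if *dist + tolerance >= d && *dist <= d + tolerance {
                child.search(query, tolerance, out);
            }
        }
    }
}

fn main() {
    let mut tree = Node::new("search");
    for word in ["serach", "query", "queries", "dataset"] {
        tree.insert(word);
    }
    let mut matches = Vec::new();
    tree.search("serch", 2, &mut matches);
    println!("candidates within 2 edits: {matches:?}");
}
```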
diff --git a/server/src/bin/file-worker.rs b/server/src/bin/file-worker.rs
index 39da0a253b..20274e633c 100644
--- a/server/src/bin/file-worker.rs
+++ b/server/src/bin/file-worker.rs
@@ -232,6 +232,19 @@ async fn file_worker(
Ok(Some(file_id)) => {
log::info!("Uploaded file: {:?}", file_id);
+ event_queue
+ .send(ClickHouseEvent::WorkerEvent(
+ models::WorkerEvent::from_details(
+ file_worker_message.dataset_id,
+ models::EventType::FileUploaded {
+ file_id,
+ file_name: file_worker_message.upload_file_data.file_name.clone(),
+ },
+ )
+ .into(),
+ ))
+ .await;
+
let _ = redis::cmd("LREM")
.arg("file_processing")
.arg(1)
diff --git a/server/src/bin/word-id-cronjob.rs b/server/src/bin/word-id-cronjob.rs
new file mode 100644
index 0000000000..6b1f09861f
--- /dev/null
+++ b/server/src/bin/word-id-cronjob.rs
@@ -0,0 +1,191 @@
+use chm::tools::migrations::SetupArgs;
+use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
+use futures::future::join_all;
+use itertools::Itertools;
+use tracing_subscriber::{prelude::*, EnvFilter, Layer};
+use trieve_server::{
+ errors::ServiceError,
+ establish_connection, get_env,
+ operators::{
+ chunk_operator::{
+ get_last_processed_from_clickhouse, scroll_chunk_ids_for_dictionary_query,
+ },
+ dataset_operator::get_all_dataset_ids,
+ words_operator::ProcessWordsFromDatasetMessage,
+ },
+};
+
+#[allow(clippy::print_stdout)]
+#[tokio::main]
+async fn main() -> Result<(), ServiceError> {
+ dotenvy::dotenv().ok();
+ log::info!("Starting id worker service thread");
+ let sentry_url = std::env::var("SENTRY_URL");
+ let _guard = if let Ok(sentry_url) = sentry_url {
+ let guard = sentry::init((
+ sentry_url,
+ sentry::ClientOptions {
+ release: sentry::release_name!(),
+ traces_sample_rate: 1.0,
+ ..Default::default()
+ },
+ ));
+
+ tracing_subscriber::Registry::default()
+ .with(sentry::integrations::tracing::layer())
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ log::info!("Sentry monitoring enabled");
+ Some(guard)
+ } else {
+ tracing_subscriber::Registry::default()
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ None
+ };
+
+ let redis_url = get_env!("REDIS_URL", "REDIS_URL is not set");
+ let redis_connections: u32 = std::env::var("REDIS_CONNECTIONS")
+ .unwrap_or("2".to_string())
+ .parse()
+ .unwrap_or(2);
+
+ let redis_manager =
+ bb8_redis::RedisConnectionManager::new(redis_url).expect("Failed to connect to redis");
+
+ let redis_pool = bb8_redis::bb8::Pool::builder()
+ .max_size(redis_connections)
+ .connection_timeout(std::time::Duration::from_secs(2))
+ .build(redis_manager)
+ .await
+ .expect("Failed to create redis pool");
+
+ let database_url = get_env!("DATABASE_URL", "DATABASE_URL is not set");
+
+ let mut config = ManagerConfig::default();
+ config.custom_setup = Box::new(establish_connection);
+
+ let mgr = AsyncDieselConnectionManager::<diesel_async::AsyncPgConnection>::new_with_config(
+ database_url,
+ config,
+ );
+
+ let pool = diesel_async::pooled_connection::deadpool::Pool::builder(mgr)
+ .max_size(3)
+ .build()
+ .expect("Failed to create diesel_async pool");
+
+ let pool = actix_web::web::Data::new(pool.clone());
+
+ let args = SetupArgs {
+ url: Some(get_env!("CLICKHOUSE_URL", "CLICKHOUSE_URL is not set").to_string()),
+ user: Some(get_env!("CLICKHOUSE_USER", "CLICKHOUSE_USER is not set").to_string()),
+ password: Some(
+ get_env!("CLICKHOUSE_PASSWORD", "CLICKHOUSE_PASSWORD is not set").to_string(),
+ ),
+ database: Some(get_env!("CLICKHOUSE_DB", "CLICKHOUSE_DB is not set").to_string()),
+ };
+
+ let clickhouse_client = clickhouse::Client::default()
+ .with_url(args.url.as_ref().unwrap())
+ .with_user(args.user.as_ref().unwrap())
+ .with_password(args.password.as_ref().unwrap())
+ .with_database(args.database.as_ref().unwrap())
+ .with_option("async_insert", "1")
+ .with_option("wait_for_async_insert", "0");
+
+ let dataset_ids = get_all_dataset_ids(pool.clone()).await?;
+ let dataset_ids_and_processed = dataset_ids
+ .into_iter()
+ .map(|dataset_id| {
+ let clickhouse_client = clickhouse_client.clone();
+ async move {
+ (
+ dataset_id,
+ get_last_processed_from_clickhouse(&clickhouse_client, dataset_id).await,
+ )
+ }
+ })
+ .collect_vec();
+
+ let dataset_ids_and_processed = join_all(dataset_ids_and_processed).await;
+
+ for (dataset_id, last_processed) in dataset_ids_and_processed {
+ let mut chunk_id_offset = uuid::Uuid::nil();
+
+ let last_processed = last_processed.map_err(|_| {
+ ServiceError::InternalServerError(
+ "Failed to get last processed from clickhouse".to_string(),
+ )
+ })?;
+ while let Some(chunk_id_dataset_id_list) = scroll_chunk_ids_for_dictionary_query(
+ pool.clone(),
+ dataset_id,
+ last_processed.clone(),
+ 10000,
+ chunk_id_offset,
+ )
+ .await?
+ {
+ if let Some((chunk_id, _)) = chunk_id_dataset_id_list.last() {
+ chunk_id_offset = *chunk_id
+ }
+ let redis_futures =
+ chunk_id_dataset_id_list
+ .chunks(500)
+ .map(|chunk_id_dataset_id_list| {
+ let pool = redis_pool.clone();
+ async move {
+ let mut redis_conn = pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+ let process_words_msg = ProcessWordsFromDatasetMessage {
+ chunks_to_process: chunk_id_dataset_id_list.to_vec(),
+ attempt_number: 0,
+ };
+
+ match serde_json::to_string(&process_words_msg).map_err(|_| {
+ ServiceError::InternalServerError(
+ "Failed to serialize message".to_string(),
+ )
+ }) {
+ Ok(serialized_msg) => redis::cmd("LPUSH")
+ .arg("create_dictionary")
+ .arg(serialized_msg)
+ .query_async::(
+ &mut *redis_conn,
+ )
+ .await
+ .map_err(|_| {
+ ServiceError::InternalServerError(
+ "Failed to send message to redis".to_string(),
+ )
+ }),
+ Err(err) => Err(err),
+ }
+ }
+ });
+
+ let _ = join_all(redis_futures)
+ .await
+ .into_iter()
+ .collect::, ServiceError>>()?;
+ log::info!("Scrolled {} chunks", chunk_id_dataset_id_list.len());
+ }
+ }
+
+ Ok(())
+}
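The cronjob above pages through chunk ids with a moving UUID offset and enqueues one `ProcessWordsFromDatasetMessage` per batch of 500 onto the `create_dictionary` list. A std-only sketch of that scroll-and-batch control flow, with integers standing in for UUIDs and an in-memory slice standing in for `scroll_chunk_ids_for_dictionary_query`:

```rust
/// Stand-in for the scroll query: return up to `limit` ids strictly greater
/// than `offset`, or None once the dataset is exhausted.
fn scroll_after(ids: &[u64], offset: u64, limit: usize) -> Option<Vec<u64>> {
    let page: Vec<u64> = ids.iter().copied().filter(|id| *id > offset).take(limit).collect();
    if page.is_empty() {
        None
    } else {
        Some(page)
    }
}

fn main() {
    let chunk_ids: Vec<u64> = (1..=23).collect(); // sorted ids, like UUIDs ordered by the query
    let mut offset = 0u64; // uuid::Uuid::nil() in the real cronjob

    while let Some(page) = scroll_after(&chunk_ids, offset, 10) {
        if let Some(last) = page.last() {
            offset = *last; // advance the keyset cursor to the last id seen
        }
        // The cronjob splits each page into batches (500 chunk ids per message)
        // and LPUSHes one serialized ProcessWordsFromDatasetMessage per batch
        // onto "create_dictionary" for the word worker to consume.
        for batch in page.chunks(4) {
            println!("enqueue message covering {} chunk ids", batch.len());
        }
        println!("scrolled {} chunks", page.len());
    }
}
```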
diff --git a/server/src/bin/word-worker.rs b/server/src/bin/word-worker.rs
new file mode 100644
index 0000000000..157ad0dc5a
--- /dev/null
+++ b/server/src/bin/word-worker.rs
@@ -0,0 +1,417 @@
+#![allow(clippy::print_stdout)]
+use actix_web::web;
+use chm::tools::migrations::SetupArgs;
+use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
+use futures::future::join_all;
+use itertools::Itertools;
+use sentry::{Hub, SentryFutureExt};
+use signal_hook::consts::SIGTERM;
+use std::{
+ collections::HashMap,
+ sync::{
+ atomic::{AtomicBool, Ordering},
+ Arc,
+ },
+};
+use tracing_subscriber::{prelude::*, EnvFilter, Layer};
+use trieve_server::{
+ data::models,
+ errors::ServiceError,
+ establish_connection, get_env,
+ operators::{
+ chunk_operator::get_chunk_html_from_ids_query,
+ dataset_operator::add_words_to_dataset,
+ parse_operator::convert_html_to_text,
+ words_operator::{CreateBkTreeMessage, ProcessWordsFromDatasetMessage},
+ },
+};
+
+#[allow(clippy::print_stdout)]
+fn main() -> Result<(), ServiceError> {
+ dotenvy::dotenv().ok();
+ let sentry_url = std::env::var("SENTRY_URL");
+ let _guard = if let Ok(sentry_url) = sentry_url {
+ let guard = sentry::init((
+ sentry_url,
+ sentry::ClientOptions {
+ release: sentry::release_name!(),
+ traces_sample_rate: 1.0,
+ ..Default::default()
+ },
+ ));
+
+ tracing_subscriber::Registry::default()
+ .with(sentry::integrations::tracing::layer())
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ log::info!("Sentry monitoring enabled");
+ Some(guard)
+ } else {
+ tracing_subscriber::Registry::default()
+ .with(
+ tracing_subscriber::fmt::layer().with_filter(
+ EnvFilter::from_default_env()
+ .add_directive(tracing_subscriber::filter::LevelFilter::INFO.into()),
+ ),
+ )
+ .init();
+
+ None
+ };
+
+ let database_url = get_env!("DATABASE_URL", "DATABASE_URL is not set");
+
+ let mut config = ManagerConfig::default();
+ config.custom_setup = Box::new(establish_connection);
+
+ let mgr = AsyncDieselConnectionManager::<diesel_async::AsyncPgConnection>::new_with_config(
+ database_url,
+ config,
+ );
+
+ let pool = diesel_async::pooled_connection::deadpool::Pool::builder(mgr)
+ .max_size(3)
+ .build()
+ .expect("Failed to create diesel_async pool");
+
+ let web_pool = actix_web::web::Data::new(pool.clone());
+ tokio::runtime::Builder::new_multi_thread()
+ .enable_all()
+ .build()
+ .expect("Failed to create tokio runtime")
+ .block_on(
+ async move {
+ let redis_url = get_env!("REDIS_URL", "REDIS_URL is not set");
+
+ let redis_manager = bb8_redis::RedisConnectionManager::new(redis_url)
+ .expect("Failed to connect to redis");
+
+ let redis_pool = bb8_redis::bb8::Pool::builder()
+ .connection_timeout(std::time::Duration::from_secs(2))
+ .build(redis_manager)
+ .await
+ .expect("Failed to create redis pool");
+
+ let web_redis_pool = actix_web::web::Data::new(redis_pool);
+
+ let args = SetupArgs {
+ url: Some(get_env!("CLICKHOUSE_URL", "CLICKHOUSE_URL is not set").to_string()),
+ user: Some(
+ get_env!("CLICKHOUSE_USER", "CLICKHOUSE_USER is not set").to_string(),
+ ),
+ password: Some(
+ get_env!("CLICKHOUSE_PASSWORD", "CLICKHOUSE_PASSWORD is not set")
+ .to_string(),
+ ),
+ database: Some(
+ get_env!("CLICKHOUSE_DB", "CLICKHOUSE_DB is not set").to_string(),
+ ),
+ };
+
+ let clickhouse_client = clickhouse::Client::default()
+ .with_url(args.url.as_ref().unwrap())
+ .with_user(args.user.as_ref().unwrap())
+ .with_password(args.password.as_ref().unwrap())
+ .with_database(args.database.as_ref().unwrap())
+ .with_option("async_insert", "1")
+ .with_option("wait_for_async_insert", "0");
+
+ let should_terminate = Arc::new(AtomicBool::new(false));
+ signal_hook::flag::register(SIGTERM, Arc::clone(&should_terminate))
+ .expect("Failed to register shutdown hook");
+ word_worker(
+ should_terminate,
+ web_redis_pool,
+ web_pool,
+ clickhouse_client,
+ )
+ .await
+ }
+ .bind_hub(Hub::new_from_top(Hub::current())),
+ );
+
+ Ok(())
+}
+
+async fn word_worker(
+ should_terminate: Arc<AtomicBool>,
+ redis_pool: actix_web::web::Data<models::RedisPool>,
+ web_pool: actix_web::web::Data<models::Pool>,
+ clickhouse_client: clickhouse::Client,
+) {
+ log::info!("Starting word worker service thread");
+ let mut redis_conn_sleep = std::time::Duration::from_secs(1);
+
+ #[allow(unused_assignments)]
+ let mut opt_redis_connection = None;
+
+ loop {
+ let borrowed_redis_connection = match redis_pool.get().await {
+ Ok(redis_connection) => Some(redis_connection),
+ Err(err) => {
+ log::error!("Failed to get redis connection outside of loop: {:?}", err);
+ None
+ }
+ };
+
+ if borrowed_redis_connection.is_some() {
+ opt_redis_connection = borrowed_redis_connection;
+ break;
+ }
+
+ tokio::time::sleep(redis_conn_sleep).await;
+ redis_conn_sleep = std::cmp::min(redis_conn_sleep * 2, std::time::Duration::from_secs(300));
+ }
+
+ let mut redis_connection =
+ opt_redis_connection.expect("Failed to get redis connection outside of loop");
+
+ let mut broken_pipe_sleep = std::time::Duration::from_secs(10);
+
+ loop {
+ if should_terminate.load(Ordering::Relaxed) {
+ log::info!("Shutting down");
+ break;
+ }
+
+ // BRPOPLPUSH atomically moves the payload onto "process_dictionary" so it can be
+ // removed on success or retried if this worker dies mid-processing.
+ let payload_result: Result<Vec<String>, redis::RedisError> = redis::cmd("brpoplpush")
+ .arg("create_dictionary")
+ .arg("process_dictionary")
+ .arg(1.0)
+ .query_async(&mut *redis_connection)
+ .await;
+
+ let serialized_msg = match payload_result {
+ Ok(payload) => {
+ broken_pipe_sleep = std::time::Duration::from_secs(10);
+
+ if payload.is_empty() {
+ continue;
+ }
+
+ payload
+ .first()
+ .expect("Payload must have a first element")
+ .clone()
+ }
+ Err(err) => {
+ log::error!("Unable to process {:?}", err);
+
+ if err.is_io_error() {
+ tokio::time::sleep(broken_pipe_sleep).await;
+ broken_pipe_sleep =
+ std::cmp::min(broken_pipe_sleep * 2, std::time::Duration::from_secs(300));
+ }
+
+ continue;
+ }
+ };
+
+ let msg: ProcessWordsFromDatasetMessage = match serde_json::from_str(&serialized_msg) {
+ Ok(message) => message,
+ Err(err) => {
+ log::error!(
+ "Failed to deserialize message, was not an IngestionMessage: {:?}",
+ err
+ );
+ continue;
+ }
+ };
+
+ match process_chunks(
+ msg.clone(),
+ web_pool.clone(),
+ redis_pool.clone(),
+ clickhouse_client.clone(),
+ )
+ .await
+ {
+ Ok(()) => {
+ log::info!("Processing {} chunks", msg.chunks_to_process.len());
+ }
+ Err(err) => {
+ log::error!("Failed to process dataset: {:?}", err);
+ let _ = readd_error_to_queue(msg.clone(), err, redis_pool.clone()).await;
+ }
+ }
+ }
+}
+
+async fn process_chunks(
+ message: ProcessWordsFromDatasetMessage,
+ pool: web::Data<models::Pool>,
+ redis_pool: web::Data<models::RedisPool>,
+ clickhouse_client: clickhouse::Client,
+) -> Result<(), ServiceError> {
+ let mut word_count_map: HashMap<(uuid::Uuid, String), i32> = HashMap::new();
+ if let Some(chunks) = get_chunk_html_from_ids_query(
+ message
+ .chunks_to_process
+ .clone()
+ .into_iter()
+ .map(|x| x.0)
+ .collect(),
+ pool.clone(),
+ )
+ .await?
+ {
+ let chunks = chunks
+ .into_iter()
+ // add dataset_id back to chunks
+ .zip(message.chunks_to_process.clone().into_iter().map(|x| x.1))
+ .collect_vec();
+
+ for ((_, chunk), dataset_id) in &chunks {
+ let content = convert_html_to_text(chunk);
+ for word in content
+ .split([' ', '\n', '\t', '\r', ',', '.', ';', ':', '!', '?'].as_ref())
+ .filter(|word| !word.is_empty())
+ {
+ let word = word
+ .replace(|c: char| !c.is_alphabetic(), "")
+ .to_lowercase()
+ .chars()
+ .take(50)
+ .join("");
+ if let Some(count) = word_count_map.get_mut(&(*dataset_id, word.clone())) {
+ *count += 1;
+ } else {
+ word_count_map.insert((*dataset_id, word), 1);
+ }
+ }
+ }
+ }
+
+ let (dataset_id_word, counts): (Vec<_>, Vec<_>) = word_count_map
+ .into_iter()
+ .sorted_by_key(|((_, word), _)| word.clone())
+ .unzip();
+
+ let words_and_counts = dataset_id_word
+ .into_iter()
+ .zip(counts.into_iter())
+ .dedup_by(|((_, word1), _), ((_, word2), _)| word1 == word2)
+ .collect_vec();
+
+ let word_dataset_relation_futs = words_and_counts
+ .chunks(5000)
+ .map(|ids_counts| {
+ let words = ids_counts.iter().map(|((_, w), _)| w.clone()).collect_vec();
+ let dataset_ids = ids_counts
+ .iter()
+ .map(|((d, _), _)| d.to_owned())
+ .collect_vec();
+ let counts = ids_counts
+ .iter()
+ .map(|((_, _), c)| c.to_owned())
+ .collect_vec();
+ add_words_to_dataset(words, counts, dataset_ids, &clickhouse_client)
+ })
+ .collect_vec();
+
+ join_all(word_dataset_relation_futs)
+ .await
+ .into_iter()
+ .collect::<Result<Vec<()>, ServiceError>>()?;
+
+ let serialized_payload = serde_json::to_string(&message).map_err(|_| {
+ ServiceError::InternalServerError("Failed to reserialize input".to_string())
+ })?;
+
+ let mut redis_conn = redis_pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ let _ = redis::cmd("LREM")
+ .arg("process_dictionary")
+ .arg(1)
+ .arg(serialized_payload)
+ .query_async::<redis::aio::MultiplexedConnection, usize>(&mut *redis_conn)
+ .await;
+
+ let create_tree_msgs = words_and_counts
+ .iter()
+ .map(|((dataset_id, _), _)| *dataset_id)
+ .unique()
+ .map(|id| {
+ let msg = CreateBkTreeMessage {
+ dataset_id: id,
+ attempt_number: 0,
+ };
+
+ serde_json::to_string(&msg).map_err(|_| {
+ ServiceError::InternalServerError("Failed to serialize message".to_string())
+ })
+ })
+ .collect::, ServiceError>>()?;
+
+ redis::cmd("SADD")
+ .arg("bktree_creation")
+ .arg(create_tree_msgs)
+ .query_async::<redis::aio::MultiplexedConnection, usize>(&mut *redis_conn)
+ .await
+ .map_err(|_| {
+ ServiceError::InternalServerError("Failed to send message to redis".to_string())
+ })?;
+
+ Ok(())
+}
+
+#[tracing::instrument(skip(redis_pool))]
+pub async fn readd_error_to_queue(
+ mut message: ProcessWordsFromDatasetMessage,
+ error: ServiceError,
+ redis_pool: actix_web::web::Data<models::RedisPool>,
+) -> Result<(), ServiceError> {
+ let old_payload_message = serde_json::to_string(&message).map_err(|_| {
+ ServiceError::InternalServerError("Failed to reserialize input for retry".to_string())
+ })?;
+
+ let mut redis_conn = redis_pool
+ .get()
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ let _ = redis::cmd("lrem")
+ .arg("process_dictionary")
+ .arg(1)
+ .arg(old_payload_message.clone())
+ .query_async::<redis::aio::MultiplexedConnection, usize>(&mut *redis_conn)
+ .await;
+
+ message.attempt_number += 1;
+
+ if message.attempt_number == 3 {
+ log::error!("Failed to process dataset 3 times: {:?}", error);
+ redis::cmd("lpush")
+ .arg("dictionary_dead_letters")
+ .arg(old_payload_message)
+ .query_async(&mut *redis_conn)
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+ return Err(ServiceError::InternalServerError(format!(
+ "Failed to create new qdrant point: {:?}",
+ error
+ )));
+ }
+
+ let new_payload_message = serde_json::to_string(&message).map_err(|_| {
+ ServiceError::InternalServerError("Failed to reserialize input for retry".to_string())
+ })?;
+
+ redis::cmd("lpush")
+ .arg("create_dictionary")
+ .arg(&new_payload_message)
+ .query_async(&mut *redis_conn)
+ .await
+ .map_err(|err| ServiceError::BadRequest(err.to_string()))?;
+
+ Ok(())
+}
diff --git a/server/src/data/models.rs b/server/src/data/models.rs
index 91c706bfdc..702f8af670 100644
--- a/server/src/data/models.rs
+++ b/server/src/data/models.rs
@@ -4,7 +4,8 @@ use super::schema::*;
use crate::errors::ServiceError;
use crate::get_env;
use crate::handlers::chunk_handler::{
- AutocompleteReqPayload, ChunkFilter, FullTextBoost, SearchChunksReqPayload, SemanticBoost,
+ AutocompleteReqPayload, ChunkFilter, FullTextBoost, ParsedQuery, SearchChunksReqPayload,
+ SemanticBoost,
};
use crate::handlers::file_handler::UploadFileReqPayload;
use crate::handlers::group_handler::{SearchOverGroupsReqPayload, SearchWithinGroupReqPayload};
@@ -1871,9 +1872,9 @@ pub struct Dataset {
pub created_at: chrono::NaiveDateTime,
pub updated_at: chrono::NaiveDateTime,
pub organization_id: uuid::Uuid,
+ pub server_configuration: serde_json::Value,
pub tracking_id: Option<String>,
pub deleted: i32,
- pub server_configuration: serde_json::Value,
}
impl Dataset {
@@ -5019,6 +5020,28 @@ pub struct HighlightOptions {
pub highlight_window: Option<u32>,
}
+#[derive(Serialize, Deserialize, Debug, Clone, ToSchema, Default)]
+/// Typo options let you specify how typos in the query should be corrected. If not specified, typos will not be corrected.
+pub struct TypoOptions {
+ /// Set correct_typos to true to correct typos in the query. If not specified, this defaults to false.
+ pub correct_typos: Option<bool>,
+ /// The word-length range within which a word containing one typo will be corrected. If not specified, this defaults to 5-8.
+ pub one_typo_word_range: Option<TypoRange>,
+ /// The word-length range within which a word containing two typos will be corrected. If not specified, this defaults to 8-inf.
+ pub two_typo_word_range: Option<TypoRange>,
+ /// Words that should never be corrected. If not specified, this defaults to an empty list.
+ pub disable_on_word: Option<Vec<String>>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, ToSchema, Default)]
+/// TypoRange specifies the word-length range (in characters) within which typo correction is applied.
+pub struct TypoRange {
+ /// The minimum number of characters a word must have for its typo to be corrected. If not specified, this defaults to 5.
+ pub min: u32,
+ /// The maximum number of characters a word may have for its typo to be corrected. If not specified, this defaults to 8.
+ pub max: Option<u32>,
+}
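+
+// Illustrative only, not part of the original patch: assuming the field types
+// reconstructed above (Option<bool>, Option<TypoRange>, Option<Vec<String>>), a
+// caller could enable typo correction like this:
+//
+// let typo_options = TypoOptions {
+//     correct_typos: Some(true),
+//     one_typo_word_range: Some(TypoRange { min: 5, max: Some(8) }),
+//     two_typo_word_range: Some(TypoRange { min: 8, max: None }),
+//     disable_on_word: Some(vec!["trieve".to_string()]),
+// };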
+
#[derive(Debug, Serialize, Deserialize, ToSchema, Clone, Default)]
/// LLM options to use for the completion. If not specified, this defaults to the dataset's LLM options.
pub struct LLMOptions {
@@ -5174,6 +5197,7 @@ impl<'de> Deserialize<'de> for SearchChunksReqPayload {
use_quote_negated_terms: Option<bool>,
remove_stop_words: Option<bool>,
user_id: Option<String>,
+ typo_options: Option<TypoOptions>,
#[serde(flatten)]
other: std::collections::HashMap<String, serde_json::Value>,
}
@@ -5204,6 +5228,7 @@ impl<'de> Deserialize<'de> for SearchChunksReqPayload {
use_quote_negated_terms: helper.use_quote_negated_terms,
remove_stop_words: helper.remove_stop_words,
user_id: helper.user_id,
+ typo_options: helper.typo_options,
})
}
}
@@ -5228,6 +5253,7 @@ impl<'de> Deserialize<'de> for AutocompleteReqPayload {
use_quote_negated_terms: Option<bool>,
remove_stop_words: Option<bool>,
user_id: Option<String>,
+ typo_options: Option<TypoOptions>,
#[serde(flatten)]
other: std::collections::HashMap<String, serde_json::Value>,
}
@@ -5257,6 +5283,7 @@ impl<'de> Deserialize<'de> for AutocompleteReqPayload {
use_quote_negated_terms: helper.use_quote_negated_terms,
remove_stop_words: helper.remove_stop_words,
user_id: helper.user_id,
+ typo_options: helper.typo_options,
})
}
}
@@ -5284,6 +5311,7 @@ impl<'de> Deserialize<'de> for SearchWithinGroupReqPayload {
use_quote_negated_terms: Option<bool>,
remove_stop_words: Option<bool>,
user_id: Option<String>,
+ typo_options: Option<TypoOptions>,
#[serde(flatten)]
other: std::collections::HashMap<String, serde_json::Value>,
}
@@ -5316,6 +5344,7 @@ impl<'de> Deserialize<'de> for SearchWithinGroupReqPayload {
use_quote_negated_terms: helper.use_quote_negated_terms,
remove_stop_words: helper.remove_stop_words,
user_id: helper.user_id,
+ typo_options: helper.typo_options,
})
}
}
@@ -5340,6 +5369,7 @@ impl<'de> Deserialize<'de> for SearchOverGroupsReqPayload {
use_quote_negated_terms: Option<bool>,
remove_stop_words: Option<bool>,
user_id: Option<String>,
+ typo_options: Option<TypoOptions>,
#[serde(flatten)]
other: std::collections::HashMap<String, serde_json::Value>,
}
@@ -5365,6 +5395,7 @@ impl<'de> Deserialize<'de> for SearchOverGroupsReqPayload {
score_threshold: helper.score_threshold,
slim_chunks: helper.slim_chunks,
use_quote_negated_terms: helper.use_quote_negated_terms,
+ typo_options: helper.typo_options,
remove_stop_words: helper.remove_stop_words,
user_id: helper.user_id,
})
@@ -5527,6 +5558,15 @@ pub struct MultiQuery {
pub weight: f32,
}
+impl From<(ParsedQuery, f32)> for MultiQuery {
+ fn from((query, weight): (ParsedQuery, f32)) -> Self {
+ Self {
+ query: query.query,
+ weight,
+ }
+ }
+}
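+
+// Illustrative only, not part of the original patch: this conversion lets a parsed
+// sub-query and its weight be turned into a MultiQuery directly, e.g. (assuming
+// `parsed` is a ParsedQuery already in scope):
+//
+// let multi: MultiQuery = (parsed, 0.5).into();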
+
#[derive(Debug, Serialize, Deserialize, ToSchema, Clone, PartialEq)]
#[serde(untagged)]
/// Query is the search query. This can be any string. The query will be used to create an embedding vector and/or SPLADE vector which will be used to find the result set. You can either provide one query, or multiple with weights. Multi-query only works with Semantic Search and is not compatible with cross encoder re-ranking or highlights.
@@ -5545,3 +5585,27 @@ impl QueryTypes {
}
}
}
+
+#[derive(Debug, Serialize, Deserialize, Row, Clone, ToSchema)]
+pub struct WordDataset {
+ #[serde(with = "clickhouse::serde::uuid")]
+ pub id: uuid::Uuid,
+ #[serde(with = "clickhouse::serde::uuid")]
+ pub dataset_id: uuid::Uuid,
+ pub word: String,
+ pub count: i32,
+ #[serde(with = "clickhouse::serde::time::datetime")]
+ pub created_at: OffsetDateTime,
+}
+
+impl WordDataset {
+ pub fn from_details(word: String, dataset_id: uuid::Uuid, count: i32) -> Self {
+ Self {
+ id: uuid::Uuid::new_v4(),
+ word,
+ dataset_id,
+ count,
+ created_at: OffsetDateTime::now_utc(),
+ }
+ }
+}
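+
+// Illustrative only, not part of the original patch: a WordDataset row pairs a word
+// and its count with a dataset, e.g. (assuming `dataset_id` is a uuid::Uuid in scope):
+//
+// let row = WordDataset::from_details("example".to_string(), dataset_id, 3);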
diff --git a/server/src/data/schema.rs b/server/src/data/schema.rs
index 3921eba22a..96b9ecd825 100644
--- a/server/src/data/schema.rs
+++ b/server/src/data/schema.rs
@@ -90,9 +90,9 @@ diesel::table! {
created_at -> Timestamp,
updated_at -> Timestamp,
organization_id -> Uuid,
+ server_configuration -> Jsonb,
tracking_id -> Nullable<Text>,
deleted -> Int4,
- server_configuration -> Jsonb,
}
}
diff --git a/server/src/handlers/chunk_handler.rs b/server/src/handlers/chunk_handler.rs
index 1830e3b9db..421f2e9ecd 100644
--- a/server/src/handlers/chunk_handler.rs
+++ b/server/src/handlers/chunk_handler.rs
@@ -5,7 +5,7 @@ use crate::data::models::{
HighlightOptions, IngestSpecificChunkMetadata, Pool, QueryTypes, RagQueryEventClickhouse,
RecommendType, RecommendationEventClickhouse, RecommendationStrategy, RedisPool, ScoreChunk,
ScoreChunkDTO, SearchMethod, SearchQueryEventClickhouse, SlimChunkMetadataWithScore,
- SortByField, SortOptions, UnifiedId, UpdateSpecificChunkMetadata,
+ SortByField, SortOptions, TypoOptions, UnifiedId, UpdateSpecificChunkMetadata,
};
use crate::errors::ServiceError;
use crate::get_env;
@@ -963,6 +963,8 @@ pub struct SearchChunksReqPayload {
pub remove_stop_words: Option<bool>,
/// User ID is the id of the user who is making the request. This is used to track user interactions with the search results.
pub user_id: Option<String>,
+ /// Typo options let you specify how typos in the search query should be handled. If not specified, this defaults to no typo handling.
+ pub typo_options: Option<TypoOptions>,
}
impl Default for SearchChunksReqPayload {
@@ -982,6 +984,7 @@ impl Default for SearchChunksReqPayload {
use_quote_negated_terms: None,
remove_stop_words: None,
user_id: None,
+ typo_options: None,
}
}
}
@@ -990,6 +993,7 @@ impl Default for SearchChunksReqPayload {
#[schema(title = "V1")]
pub struct SearchChunkQueryResponseBody {
pub score_chunks: Vec<ScoreChunkDTO>,
+ pub corrected_query: Option<String>,
pub total_chunk_pages: i64,
}
@@ -998,6 +1002,7 @@ pub struct SearchChunkQueryResponseBody {
pub struct SearchResponseBody {
pub id: uuid::Uuid,
pub chunks: Vec<ScoreChunk>,
+ pub corrected_query: Option<String>,
pub total_pages: i64,
}
@@ -1019,6 +1024,7 @@ impl SearchChunkQueryResponseBody {
.into_iter()
.map(|chunk| chunk.into())
.collect(),
+ corrected_query: self.corrected_query,
total_pages: self.total_chunk_pages,
}
}
@@ -1140,18 +1146,21 @@ pub fn parse_query(
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn search_chunks(
- mut data: web::Json<SearchChunksReqPayload>,
+ data: web::Json<SearchChunksReqPayload>,
_user: LoggedUser,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
api_version: APIVersion,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
) -> Result<HttpResponse, actix_web::Error> {
let dataset_config =
DatasetConfiguration::from_json(dataset_org_plan_sub.dataset.server_configuration.clone());
+ let mut data = data.into_inner();
+
let parsed_query = match data.query.clone() {
QueryTypes::Single(query) => ParsedQueryTypes::Single(parse_query(
query.clone(),
@@ -1186,6 +1195,7 @@ pub async fn search_chunks(
data.clone(),
parsed_query.to_parsed_query()?,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
@@ -1197,6 +1207,7 @@ pub async fn search_chunks(
data.clone(),
parsed_query,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
@@ -1332,6 +1343,7 @@ pub struct AutocompleteReqPayload {
pub remove_stop_words: Option<bool>,
/// User ID is the id of the user who is making the request. This is used to track user interactions with the search results.
pub user_id: Option<String>,
+ pub typo_options: Option<TypoOptions>,
}
impl From<AutocompleteReqPayload> for SearchChunksReqPayload {
@@ -1351,6 +1363,7 @@ impl From for SearchChunksReqPayload {
use_quote_negated_terms: autocomplete_data.use_quote_negated_terms,
remove_stop_words: autocomplete_data.remove_stop_words,
user_id: autocomplete_data.user_id,
+ typo_options: autocomplete_data.typo_options,
}
}
}
@@ -1376,12 +1389,13 @@ impl From for SearchChunksReqPayload {
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn autocomplete(
data: web::Json<AutocompleteReqPayload>,
_user: LoggedUser,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
api_version: APIVersion,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
) -> Result<HttpResponse, actix_web::Error> {
@@ -1404,6 +1418,7 @@ pub async fn autocomplete(
data.clone(),
parsed_query,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
@@ -1645,6 +1660,7 @@ impl From for SearchChunksReqPayload {
use_quote_negated_terms: count_data.use_quote_negated_terms,
remove_stop_words: None,
user_id: None,
+ typo_options: None,
}
}
}
diff --git a/server/src/handlers/group_handler.rs b/server/src/handlers/group_handler.rs
index a498ea54c8..87d4b4c468 100644
--- a/server/src/handlers/group_handler.rs
+++ b/server/src/handlers/group_handler.rs
@@ -10,7 +10,7 @@ use crate::{
ChunkMetadataStringTagSet, DatasetAndOrgWithSubAndPlan, DatasetConfiguration,
HighlightOptions, Pool, QueryTypes, RecommendType, RecommendationEventClickhouse,
RecommendationStrategy, RedisPool, ScoreChunk, ScoreChunkDTO, SearchMethod,
- SearchQueryEventClickhouse, SortOptions, UnifiedId,
+ SearchQueryEventClickhouse, SortOptions, TypoOptions, UnifiedId,
},
errors::ServiceError,
middleware::api_version::APIVersion,
@@ -1298,6 +1298,7 @@ pub async fn get_recommended_groups(
let group_qdrant_query_result = SearchOverGroupsQueryResult {
search_results: recommended_groups_from_qdrant.clone(),
+ corrected_query: None,
total_chunk_pages: (recommended_groups_from_qdrant.len() as f64 / 10.0).ceil() as i64,
};
@@ -1409,6 +1410,7 @@ pub struct SearchWithinGroupReqPayload {
pub remove_stop_words: Option<bool>,
/// The user_id is the id of the user who is making the request. This is used to track user interactions with the search results.
pub user_id: Option<String>,
+ pub typo_options: Option<TypoOptions>,
}
impl From<SearchWithinGroupReqPayload> for SearchChunksReqPayload {
@@ -1428,6 +1430,7 @@ impl From for SearchChunksReqPayload {
use_quote_negated_terms: search_within_group_data.use_quote_negated_terms,
remove_stop_words: search_within_group_data.remove_stop_words,
user_id: search_within_group_data.user_id,
+ typo_options: search_within_group_data.typo_options,
}
}
}
@@ -1437,6 +1440,7 @@ impl From for SearchChunksReqPayload {
pub struct SearchWithinGroupResults {
pub bookmarks: Vec<ScoreChunkDTO>,
pub group: ChunkGroupAndFileId,
+ pub corrected_query: Option<String>,
pub total_pages: i64,
}
@@ -1445,6 +1449,7 @@ pub struct SearchWithinGroupResults {
pub struct SearchWithinGroupResponseBody {
pub id: uuid::Uuid,
pub chunks: Vec<ScoreChunk>,
+ pub corrected_query: Option<String>,
pub total_pages: i64,
}
@@ -1466,6 +1471,7 @@ impl SearchWithinGroupResults {
.into_iter()
.map(|chunk| chunk.into())
.collect(),
+ corrected_query: self.corrected_query,
total_pages: self.total_pages,
}
}
@@ -1492,11 +1498,12 @@ impl SearchWithinGroupResults {
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn search_within_group(
data: web::Json<SearchWithinGroupReqPayload>,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
api_version: APIVersion,
_required_user: LoggedUser,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
@@ -1504,6 +1511,8 @@ pub async fn search_within_group(
let dataset_config =
DatasetConfiguration::from_json(dataset_org_plan_sub.dataset.server_configuration.clone());
+ let data = data.into_inner();
+
//search over the links as well
let group_id = data.group_id;
let dataset_id = dataset_org_plan_sub.dataset.id;
@@ -1551,8 +1560,10 @@ pub async fn search_within_group(
parsed_query.to_parsed_query()?,
group,
search_pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
+ &mut timer,
)
.await?
}
@@ -1562,8 +1573,10 @@ pub async fn search_within_group(
parsed_query,
group,
search_pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
+ &mut timer,
)
.await?
}
@@ -1642,6 +1655,7 @@ pub struct SearchOverGroupsReqPayload {
pub remove_stop_words: Option<bool>,
/// The user_id is the id of the user who is making the request. This is used to track user interactions with the search results.
pub user_id: Option<String>,
+ pub typo_options: Option<TypoOptions>,
}
/// Search Over Groups
@@ -1665,11 +1679,12 @@ pub struct SearchOverGroupsReqPayload {
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn search_over_groups(
data: web::Json<SearchOverGroupsReqPayload>,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
api_version: APIVersion,
_required_user: LoggedUser,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
@@ -1713,6 +1728,7 @@ pub async fn search_over_groups(
data.clone(),
parsed_query,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
@@ -1724,6 +1740,7 @@ pub async fn search_over_groups(
data.clone(),
parsed_query.to_parsed_query()?,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
@@ -1741,6 +1758,7 @@ pub async fn search_over_groups(
data.clone(),
parsed_query,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut timer,
diff --git a/server/src/handlers/message_handler.rs b/server/src/handlers/message_handler.rs
index a9b4733bfb..527cb248c5 100644
--- a/server/src/handlers/message_handler.rs
+++ b/server/src/handlers/message_handler.rs
@@ -5,7 +5,7 @@ use super::{
use crate::{
data::models::{
self, ChunkMetadata, DatasetAndOrgWithSubAndPlan, DatasetConfiguration, HighlightOptions,
- LLMOptions, Pool, SearchMethod, SuggestType,
+ LLMOptions, Pool, RedisPool, SearchMethod, SuggestType,
},
errors::ServiceError,
get_env,
@@ -131,6 +131,7 @@ pub async fn create_message(
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
event_queue: web::Data<EventQueue>,
pool: web::Data<Pool>,
+ redis_pool: web::Data<RedisPool>,
) -> Result<HttpResponse, actix_web::Error> {
let message_count_pool = pool.clone();
let message_count_org_id = dataset_org_plan_sub.organization.organization.id;
@@ -225,6 +226,7 @@ pub async fn create_message(
dataset_org_plan_sub.dataset,
stream_response_pool,
event_queue,
+ redis_pool,
dataset_config,
create_message_data,
)
@@ -389,13 +391,14 @@ impl From for CreateMessageReqPayload {
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn edit_message(
data: web::Json<EditMessageReqPayload>,
user: AdminOnly,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
) -> Result<HttpResponse, actix_web::Error> {
let topic_id: uuid::Uuid = data.topic_id;
let message_sort_order = data.message_sort_order;
@@ -428,6 +431,7 @@ pub async fn edit_message(
dataset_org_plan_sub,
event_queue,
third_pool,
+ redis_pool,
)
.await
}
@@ -453,13 +457,14 @@ pub async fn edit_message(
("ApiKey" = ["readonly"]),
)
)]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn regenerate_message_patch(
data: web::Json<RegenerateMessageReqPayload>,
user: AdminOnly,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
) -> Result<HttpResponse, actix_web::Error> {
let topic_id = data.topic_id;
let dataset_config =
@@ -487,6 +492,7 @@ pub async fn regenerate_message_patch(
dataset_org_plan_sub.dataset,
create_message_pool,
event_queue,
+ redis_pool.clone(),
dataset_config,
data.into_inner().into(),
)
@@ -559,6 +565,7 @@ pub async fn regenerate_message_patch(
dataset_org_plan_sub.dataset,
create_message_pool,
event_queue,
+ redis_pool.clone(),
dataset_config,
data.into_inner().into(),
)
@@ -587,15 +594,24 @@ pub async fn regenerate_message_patch(
)
)]
#[deprecated]
-#[tracing::instrument(skip(pool, event_queue))]
+#[tracing::instrument(skip(pool, event_queue, redis_pool))]
pub async fn regenerate_message(
data: web::Json<RegenerateMessageReqPayload>,
user: AdminOnly,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
event_queue: web::Data<EventQueue>,
+ redis_pool: web::Data<RedisPool>,
) -> Result<HttpResponse, actix_web::Error> {
- regenerate_message_patch(data, user, dataset_org_plan_sub, pool, event_queue).await
+ regenerate_message_patch(
+ data,
+ user,
+ dataset_org_plan_sub,
+ pool,
+ event_queue,
+ redis_pool,
+ )
+ .await
}
#[derive(Deserialize, Serialize, Debug, ToSchema)]
@@ -643,6 +659,7 @@ pub async fn get_suggested_queries(
data: web::Json<SuggestedQueriesReqPayload>,
dataset_org_plan_sub: DatasetAndOrgWithSubAndPlan,
pool: web::Data<Pool>,
+ redis_pool: web::Data<RedisPool>,
_required_user: LoggedUser,
) -> Result<HttpResponse, actix_web::Error> {
let dataset_id = dataset_org_plan_sub.dataset.id;
@@ -669,7 +686,6 @@ pub async fn get_suggested_queries(
)
.into()
};
-
let search_type = data.search_type.clone().unwrap_or(SearchMethod::Hybrid);
let filters = data.filters.clone();
@@ -692,6 +708,7 @@ pub async fn get_suggested_queries(
search_req_payload,
parsed_query,
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut Timer::new(),
@@ -702,6 +719,7 @@ pub async fn get_suggested_queries(
search_req_payload,
ParsedQueryTypes::Single(parsed_query),
pool,
+ redis_pool,
dataset_org_plan_sub.dataset.clone(),
&dataset_config,
&mut Timer::new(),
@@ -865,6 +883,7 @@ pub async fn get_suggested_queries(
Some(cleaned_query)
}
})
+ .map(|query| query.to_string().trim().trim_matches('\n').to_string())
.collect();
while queries.len() < 3 {
diff --git a/server/src/lib.rs b/server/src/lib.rs
index 6015155d90..f264b713fc 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -9,7 +9,7 @@ use crate::{
handlers::{auth_handler::build_oidc_client, metrics_handler::Metrics},
operators::{
clickhouse_operator::EventQueue, qdrant_operator::create_new_qdrant_collection_query,
- user_operator::create_default_user,
+ user_operator::create_default_user, words_operator::BKTreeCache,
},
};
use actix_cors::Cors;
@@ -411,6 +411,8 @@ impl Modify for SecurityAddon {
data::models::SortOptions,
data::models::LLMOptions,
data::models::HighlightOptions,
+ data::models::TypoOptions,
+ data::models::TypoRange,
data::models::SortByField,
data::models::SortBySearchType,
data::models::ReRankOptions,
@@ -629,6 +631,8 @@ pub fn main() -> std::io::Result<()> {
(clickhouse::Client::default(), EventQueue::default())
};
+ BKTreeCache::enforce_cache_ttl();
+
let metrics = Metrics::new().map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("Failed to create metrics {:?}", e))
diff --git a/server/src/operators/chunk_operator.rs b/server/src/operators/chunk_operator.rs
index a4baaf78f0..a9fa4e765e 100644
--- a/server/src/operators/chunk_operator.rs
+++ b/server/src/operators/chunk_operator.rs
@@ -18,6 +18,7 @@ use crate::{
};
use actix_web::web;
use chrono::NaiveDateTime;
+use clickhouse::Row;
use dateparser::DateTimeUtc;
use diesel::dsl::{not, sql};
use diesel::prelude::*;
@@ -29,6 +30,7 @@ use itertools::Itertools;
use serde::{Deserialize, Serialize};
use simsearch::{SearchOptions, SimSearch};
use std::collections::{HashMap, HashSet};
+use time::OffsetDateTime;
use utoipa::ToSchema;
use super::group_operator::create_groups_query;
@@ -2427,3 +2429,103 @@ pub async fn get_pg_point_ids_from_qdrant_point_ids(
Ok(chunk_ids)
}
+
+#[tracing::instrument(skip(pool))]
+pub async fn get_chunk_html_from_ids_query(
+ chunk_ids: Vec<uuid::Uuid>,
+ pool: web::Data<Pool>,
+) -> Result