Skip to content

Commit

Permalink
[8.16] [ML] File upload fixing PDF character count limit (#197333) (#…
Browse files Browse the repository at this point in the history
…197601)

# Backport

This will backport the following commits from `main` to `8.16`:
- [[ML] File upload fixing PDF character count limit
(#197333)](#197333)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"James
Gowdy","email":"[email protected]"},"sourceCommit":{"committedDate":"2024-10-24T10:47:58Z","message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100","branchLabelMapping":{"^v9.0.0$":"main","^v8.17.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix",":ml","Feature:File
and Index Data Viz","Feature:File
Upload","v9.0.0","v8.16.0","backport:version","v8.17.0"],"title":"[ML]
File upload fixing PDF character count
limit","number":197333,"url":"https://github.com/elastic/kibana/pull/197333","mergeCommit":{"message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100"}},"sourceBranch":"main","suggestedTargetBranches":["8.16","8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.0.0","branchLabelMappingKey":"^v9.0.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/197333","number":197333,"mergeCommit":{"message":"[ML]
File upload fixing PDF character count limit (#197333)\n\nThe default
character limit for the attachment processor is 100000\r\ncharacters.
This limit is useful when previewing the contents of the\r\nfile, but
should not be applied when ingesting the file.\r\n\r\nThis PR changes
the ingest character limit to be unlimited (-1) for\r\ningest and
displays the character limit instead of the line limit for\r\nthe
document
preview.\r\n\r\n\r\n![image](https://github.com/user-attachments/assets/1c0cf324-a2b8-452c-b504-7c5b2935ba1c)","sha":"9aa67ef45596080f742166f1c63e2c8f9a44f100"}},{"branch":"8.16","label":"v8.16.0","branchLabelMappingKey":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"8.x","label":"v8.17.0","branchLabelMappingKey":"^v8.17.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->

Co-authored-by: James Gowdy <[email protected]>
  • Loading branch information
kibanamachine and jgowdyelastic authored Oct 24, 2024
1 parent b254359 commit 85159e3
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ import {
EuiSwitch,
} from '@elastic/eui';

import type { FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import { TIKA_PREVIEW_CHARS, type FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import useMountedState from 'react-use/lib/useMountedState';
import { i18n } from '@kbn/i18n';
import { FILE_FORMATS } from '../../../../../common/constants';
import { EDITOR_MODE, JsonEditor } from '../json_editor';
import { useGrokHighlighter } from './use_text_parser';
import { LINE_LIMIT } from './grok_highlighter';
Expand Down Expand Up @@ -132,13 +133,23 @@ export const FileContents: FC<Props> = ({

<EuiSpacer size="s" />

<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.firstLinesDescription"
defaultMessage="First {numberOfLines, plural, zero {# line} one {# line} other {# lines}}"
values={{
numberOfLines: showHighlights ? LINE_LIMIT : numberOfLines,
}}
/>
{format === FILE_FORMATS.TIKA ? (
<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.characterCount"
defaultMessage="Preview limited to the first {numberOfChars} characters"
values={{
numberOfChars: TIKA_PREVIEW_CHARS,
}}
/>
) : (
<FormattedMessage
id="xpack.dataVisualizer.file.fileContents.firstLinesDescription"
defaultMessage="First {numberOfLines, plural, zero {# line} one {# line} other {# lines}}"
values={{
numberOfLines: showHighlights ? LINE_LIMIT : numberOfLines,
}}
/>
)}

<EuiSpacer size="s" />

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ export async function analyzeTikaFile(
attachment: {
field: 'data',
remove_binary: true,
// unlimited character count
indexed_chars: -1,
},
},
],
Expand Down
2 changes: 2 additions & 0 deletions x-pack/plugins/file_upload/common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ export const FILE_FORMATS = {
SEMI_STRUCTURED_TEXT: 'semi_structured_text',
TIKA: 'tika',
};

/**
 * Character cap applied when previewing the contents of a Tika-parsed file.
 *
 * Passed as `indexed_chars` to the attachment processor for the preview request
 * and shown to the user in the "Preview limited to the first … characters" message.
 */
export const TIKA_PREVIEW_CHARS = 100_000;
2 changes: 2 additions & 0 deletions x-pack/plugins/file_upload/common/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ export type {
InputOverrides,
IngestPipeline,
} from './types';

export { TIKA_PREVIEW_CHARS } from './constants';
2 changes: 2 additions & 0 deletions x-pack/plugins/file_upload/server/preview_tika_contents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import type { IScopedClusterClient } from '@kbn/core/server';
import type { PreviewTikaResponse } from '../common/types';
import { TIKA_PREVIEW_CHARS } from '../common/constants';

/**
* Returns the contents of a file using the attachment ingest processor
Expand All @@ -24,6 +25,7 @@ export async function previewTikaContents(
attachment: {
field: 'data',
remove_binary: true,
indexed_chars: TIKA_PREVIEW_CHARS,
},
},
],
Expand Down

0 comments on commit 85159e3

Please sign in to comment.