Skip to content

Commit

Permalink
[ML] File upload: Adds support for PDF files (#186956)
Browse files Browse the repository at this point in the history
Also adds support for txt, rtf, doc, docx, xls, xlsx, ppt, pptx, odt, ods, and odp files.

Adds the ability to automatically add a semantic text field to the
mappings, along with a `copy_to` processor to duplicate the field. This is
needed because the mappings generated for the attachment processor
contain a nested `attachment.content` field, which cannot itself be used
as a semantic text field.

After a successful import, a link to Search's Playground app is shown.
Navigating there lets the user instantly query the newly uploaded file.


https://github.com/user-attachments/assets/09b20a5f-0e02-47fa-885e-0ed21374cc60

---------

Co-authored-by: kibanamachine <[email protected]>
Co-authored-by: Liam Thompson <[email protected]>
  • Loading branch information
3 people authored Aug 22, 2024
1 parent c5b38e4 commit 3177b03
Show file tree
Hide file tree
Showing 50 changed files with 1,657 additions and 506 deletions.
2 changes: 1 addition & 1 deletion x-pack/plugins/data_visualizer/common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ export const FILE_FORMATS = {
DELIMITED: 'delimited',
NDJSON: 'ndjson',
SEMI_STRUCTURED_TEXT: 'semi_structured_text',
// XML: 'xml',
TIKA: 'tika',
};

export const SUPPORTED_FIELD_TYPES = {
Expand Down
90 changes: 90 additions & 0 deletions x-pack/plugins/data_visualizer/common/utils/tika_utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { i18n } from '@kbn/i18n';

/**
 * Reports whether the given content type is one that `getTikaDisplayType`
 * flags as a Tika type.
 *
 * @param type - Content type string to look up.
 * @returns The `isTikaType` flag from the display-type lookup.
 */
export function isTikaType(type: string) {
  const { isTikaType: recognised } = getTikaDisplayType(type);
  return recognised;
}

/**
 * Maps a MIME content type to a human-readable label and reports whether it is
 * one of the document formats this module treats as a Tika type.
 *
 * Matching is done on the media type only: letter case and anything after the
 * first `;` are ignored, so `text/plain; charset=ISO-8859-1` is matched the
 * same way as `text/plain`.
 *
 * @param type - MIME content type string, optionally followed by parameters.
 * @returns `isTikaType: true` and a translated label for a recognised type;
 *   otherwise `isTikaType: false` with the unmodified input as the label.
 */
export const getTikaDisplayType = (type: string): { isTikaType: boolean; label: string } => {
  // Normalise only string input; anything else is left untouched so that it
  // falls through to the default branch instead of throwing.
  let mimeType = type;
  if (typeof type === 'string') {
    const paramStart = type.indexOf(';');
    mimeType = (paramStart === -1 ? type : type.slice(0, paramStart)).trim().toLowerCase();
  }

  switch (mimeType) {
    case 'application/doc':
    case 'application/ms-doc':
    case 'application/msword':
    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.word', {
          defaultMessage: 'Microsoft Office Word document',
        }),
      };

    case 'application/excel':
    case 'application/vnd.ms-excel':
    case 'application/x-excel':
    case 'application/x-msexcel':
    case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.excel', {
          defaultMessage: 'Microsoft Office Excel document',
        }),
      };

    case 'application/mspowerpoint':
    case 'application/powerpoint':
    case 'application/vnd.ms-powerpoint':
    case 'application/x-mspowerpoint':
    case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.powerPoint', {
          defaultMessage: 'Microsoft Office Power Point document',
        }),
      };

    case 'application/vnd.oasis.opendocument.presentation':
    case 'application/vnd.oasis.opendocument.spreadsheet':
    case 'application/vnd.oasis.opendocument.text':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.openDoc', {
          defaultMessage: 'Open Document Format',
        }),
      };

    case 'text/rtf':
    case 'application/rtf':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.richText', {
          defaultMessage: 'Rich Text Format',
        }),
      };

    case 'application/pdf':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.pdf', {
          defaultMessage: 'PDF',
        }),
      };

    // Parameterised forms such as `text/plain; charset=UTF-8` arrive here
    // because the parameters were stripped above.
    case 'text/plain':
      return {
        isTikaType: true,
        label: i18n.translate('xpack.dataVisualizer.file.tikaTypes.plainText', {
          defaultMessage: 'Plain text',
        }),
      };

    default:
      // Unknown types keep the caller's original string as their label.
      return { isTikaType: false, label: type };
  }
};
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@
import React from 'react';
import { EuiText } from '@elastic/eui';

import { ES_FIELD_TYPES } from '@kbn/field-types';
import type { CombinedField } from './types';

/**
 * Renders the label text for a combined field inside a small `EuiText`.
 *
 * @param combinedField - The combined field whose label is displayed.
 */
export function CombinedFieldLabel({ combinedField }: { combinedField: CombinedField }) {
  const labelText = getCombinedFieldLabel(combinedField);
  return <EuiText size="s">{labelText}</EuiText>;
}

/**
 * Builds the display text for a combined field.
 *
 * Geo point fields are shown as
 * `<field names joined by the delimiter> => <combined field name> (<mapping type>)`.
 * Every other mapping type is shown as just the combined field name.
 *
 * @param combinedField - The combined field to describe.
 * @returns The label string.
 */
function getCombinedFieldLabel(combinedField: CombinedField) {
  const { combinedFieldName, mappingType } = combinedField;

  if (mappingType !== ES_FIELD_TYPES.GEO_POINT) {
    return combinedFieldName;
  }

  const sourceFields = combinedField.fieldNames.join(combinedField.delimiter);
  return `${sourceFields} => ${combinedFieldName} (${mappingType})`;
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,13 @@ import {
EuiFlexItem,
} from '@elastic/eui';

import type { FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import type { FindFileStructureResponse, IngestPipeline } from '@kbn/file-upload-plugin/common';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import type { CombinedField } from './types';
import { GeoPointForm } from './geo_point';
import { SemanticTextForm } from './semantic_text';
import { CombinedFieldLabel } from './combined_field_label';
import {
addCombinedFieldsToMappings,
addCombinedFieldsToPipeline,
getNameCollisionMsg,
removeCombinedFieldsFromMappings,
removeCombinedFieldsFromPipeline,
} from './utils';
import { removeCombinedFieldsFromMappings, removeCombinedFieldsFromPipeline } from './utils';

interface Props {
mappingsString: string;
Expand All @@ -46,6 +42,12 @@ interface State {
isPopoverOpen: boolean;
}

/**
 * Signature of the callback used to register a new combined field.
 *
 * The caller supplies the field itself plus two transforms: `addToMappings`
 * takes the current mappings and returns the updated mappings, and
 * `addToPipeline` takes the current ingest pipeline and returns the updated
 * pipeline. The callback itself returns nothing.
 */
export type AddCombinedField = (
combinedField: CombinedField,
addToMappings: (mappings: MappingTypeMapping) => MappingTypeMapping,
addToPipeline: (pipeline: IngestPipeline) => IngestPipeline
) => void;

export class CombinedFieldsForm extends Component<Props, State> {
state: State = {
isPopoverOpen: false,
Expand All @@ -63,20 +65,20 @@ export class CombinedFieldsForm extends Component<Props, State> {
});
};

addCombinedField = (combinedField: CombinedField) => {
if (this.hasNameCollision(combinedField.combinedFieldName)) {
throw new Error(getNameCollisionMsg(combinedField.combinedFieldName));
}

addCombinedField = (
combinedField: CombinedField,
addToMappings: (mappings: MappingTypeMapping) => {},
addToPipeline: (pipeline: IngestPipeline) => {}
) => {
const mappings = this.parseMappings();
const pipeline = this.parsePipeline();

this.props.onMappingsStringChange(
JSON.stringify(addCombinedFieldsToMappings(mappings, [combinedField]), null, 2)
);
this.props.onPipelineStringChange(
JSON.stringify(addCombinedFieldsToPipeline(pipeline, [combinedField]), null, 2)
);
const newMappings = addToMappings(mappings);
const newPipeline = addToPipeline(pipeline);

this.props.onMappingsStringChange(JSON.stringify(newMappings, null, 2));
this.props.onPipelineStringChange(JSON.stringify(newPipeline, null, 2));

this.props.onCombinedFieldsChange([...this.props.combinedFields, combinedField]);

this.closePopover();
Expand Down Expand Up @@ -155,6 +157,13 @@ export class CombinedFieldsForm extends Component<Props, State> {
defaultMessage: 'Add geo point field',
}
);

const semanticTextLabel = i18n.translate(
'xpack.dataVisualizer.file.semanticTextForm.combinedFieldLabel',
{
defaultMessage: 'Add semantic text field',
}
);
const panels = [
{
id: 0,
Expand All @@ -163,6 +172,10 @@ export class CombinedFieldsForm extends Component<Props, State> {
name: geoPointLabel,
panel: 1,
},
{
name: semanticTextLabel,
panel: 2,
},
],
},
{
Expand All @@ -176,11 +189,22 @@ export class CombinedFieldsForm extends Component<Props, State> {
/>
),
},
{
id: 2,
title: semanticTextLabel,
content: (
<SemanticTextForm
addCombinedField={this.addCombinedField}
hasNameCollision={this.hasNameCollision}
results={this.props.results}
/>
),
},
];
return (
<EuiFormRow
label={i18n.translate('xpack.dataVisualizer.combinedFieldsLabel', {
defaultMessage: 'Combined fields',
defaultMessage: 'Automatically created fields',
})}
>
<div>
Expand Down Expand Up @@ -217,7 +241,7 @@ export class CombinedFieldsForm extends Component<Props, State> {
>
<FormattedMessage
id="xpack.dataVisualizer.addCombinedFieldsLabel"
defaultMessage="Add combined field"
defaultMessage="Add additional field"
/>
</EuiButtonEmpty>
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,19 @@ import {
} from '@elastic/eui';

import type { FindFileStructureResponse } from '@kbn/file-upload-plugin/common';
import type { CombinedField } from './types';
import {
createGeoPointCombinedField,
isWithinLatRange,
isWithinLonRange,
getFieldNames,
getNameCollisionMsg,
addCombinedFieldsToMappings,
addCombinedFieldsToPipeline,
} from './utils';
import type { AddCombinedField } from './combined_fields_form';

/** Props accepted by the geo point form. */
interface Props {
  /**
   * Callback invoked on submit with the new combined field and the two
   * functions that add it to the mappings and to the ingest pipeline.
   */
  addCombinedField: AddCombinedField;
  hasNameCollision: (name: string) => boolean;
  results: FindFileStructureResponse;
}
Expand Down Expand Up @@ -99,13 +101,18 @@ export class GeoPointForm extends Component<Props, State> {

onSubmit = () => {
try {
const combinedField = createGeoPointCombinedField(
this.state.latField,
this.state.lonField,
this.state.geoPointField
);

this.props.addCombinedField(
createGeoPointCombinedField(
this.state.latField,
this.state.lonField,
this.state.geoPointField
)
combinedField,
(mappings) => addCombinedFieldsToMappings(mappings, [combinedField]),
(pipeline) => addCombinedFieldsToPipeline(pipeline, [combinedField])
);

this.setState({ submitError: '' });
} catch (error) {
this.setState({ submitError: error.message });
Expand Down
Loading

0 comments on commit 3177b03

Please sign in to comment.