-
Notifications
You must be signed in to change notification settings - Fork 8.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ML] File upload: Adds support for PDF files (#186956)
Also adds support for txt, rtf, doc, docx, xls, xlsx, ppt, pptx, odt, ods, and odp files. Adds the ability to automatically add a semantic text field to the mappings and a `copy_to` processor to duplicate the field. This is needed for the mappings generated for the attachment processor, which adds a nested `attachment.content` field that cannot be used as a semantic text field. After a successful import, a link to Search's Playground app is shown. Navigating there lets the user instantly query the newly uploaded file. https://github.com/user-attachments/assets/09b20a5f-0e02-47fa-885e-0ed21374cc60 --------- Co-authored-by: kibanamachine <[email protected]> Co-authored-by: Liam Thompson <[email protected]>
- Loading branch information
1 parent
c5b38e4
commit 3177b03
Showing
50 changed files
with
1,657 additions
and
506 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
import { i18n } from '@kbn/i18n'; | ||
|
||
/**
 * Reports whether the given MIME type string is one of the types that
 * `getTikaDisplayType` flags as a Tika type.
 *
 * @param type - MIME type string to look up (matched exactly, as-is).
 * @returns The `isTikaType` flag from `getTikaDisplayType(type)`.
 */
export function isTikaType(type: string) {
  const { isTikaType: recognised } = getTikaDisplayType(type);
  return recognised;
}
|
||
/**
 * Maps a MIME type string to a display label and a flag saying whether the
 * type belongs to one of the document families listed below.
 *
 * Matching is an exact, case-sensitive string comparison; the only
 * parameterised form that is recognised is `text/plain; charset=UTF-8`.
 *
 * @param type - MIME type string to classify.
 * @returns `{ isTikaType: true, label }` with a translated label for a listed
 *   type; otherwise `{ isTikaType: false, label: type }`, echoing the input.
 */
export const getTikaDisplayType = (type: string): { isTikaType: boolean; label: string } => {
  // True when `type` is exactly one of the supplied MIME strings.
  const isOneOf = (...mimeTypes: string[]) => mimeTypes.includes(type);
  // Wraps a translated label in the "recognised" result shape.
  const recognised = (label: string) => ({ isTikaType: true, label });

  // Translations are resolved only inside the branch that matches, so an
  // unrecognised type never touches i18n.
  if (
    isOneOf(
      'application/doc',
      'application/ms-doc',
      'application/msword',
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )
  ) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.word', {
        defaultMessage: 'Microsoft Office Word document',
      })
    );
  }

  if (
    isOneOf(
      'application/excel',
      'application/vnd.ms-excel',
      'application/x-excel',
      'application/x-msexcel',
      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
  ) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.excel', {
        defaultMessage: 'Microsoft Office Excel document',
      })
    );
  }

  if (
    isOneOf(
      'application/mspowerpoint',
      'application/powerpoint',
      'application/vnd.ms-powerpoint',
      'application/x-mspowerpoint',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )
  ) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.powerPoint', {
        defaultMessage: 'Microsoft Office Power Point document',
      })
    );
  }

  if (
    isOneOf(
      'application/vnd.oasis.opendocument.presentation',
      'application/vnd.oasis.opendocument.spreadsheet',
      'application/vnd.oasis.opendocument.text'
    )
  ) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.openDoc', {
        defaultMessage: 'Open Document Format',
      })
    );
  }

  if (isOneOf('text/rtf', 'application/rtf')) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.richText', {
        defaultMessage: 'Rich Text Format',
      })
    );
  }

  if (isOneOf('application/pdf')) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.pdf', {
        defaultMessage: 'PDF',
      })
    );
  }

  if (isOneOf('text/plain', 'text/plain; charset=UTF-8')) {
    return recognised(
      i18n.translate('xpack.dataVisualizer.file.tikaTypes.plainText', {
        defaultMessage: 'Plain text',
      })
    );
  }

  // Anything else is reported as-is and flagged as not a Tika type.
  return { isTikaType: false, label: type };
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.