Skip to content

Commit

Permalink
Merge pull request #320 from CambioML/feat/keyvalue
Browse files Browse the repository at this point in the history
feat: Key-value extraction
  • Loading branch information
lingjiekong authored Dec 31, 2024
2 parents 9f7a306 + f78c09c commit eb678e7
Show file tree
Hide file tree
Showing 21 changed files with 1,193 additions and 41 deletions.
9 changes: 8 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
## Description

<!-- Provide a brief description of the changes introduced by this PR -->

## Related Issue

<!-- Link to the related issue (if applicable) using #issue_number -->

## Type of Change

<!-- Put an `x` in the boxes that apply -->

- [ ] Bug fix (non-breaking change which fixes an issue)
Expand All @@ -15,12 +18,15 @@
- [ ] Performance improvement

## How Has This Been Tested?

<!-- Describe the tests you ran to verify your changes -->

## Screenshots (if applicable)

<!-- Add screenshots to help explain your changes -->

## Checklist

<!-- Put an `x` in the boxes that apply -->

- [ ] My code follows the project's style guidelines
Expand All @@ -32,4 +38,5 @@
- [ ] New and existing unit tests pass locally with my changes

## Additional Notes
<!-- Add any additional notes or context about the PR here -->

<!-- Add any additional notes or context about the PR here -->
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@
"editor.codeActionsOnSave": {
"source.fixAll.eslint": "explicit"
},
"cSpell.words": [
"cambio"
],
}
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Cambio Website Source

For Dev setup, please refer to this [Notion Page](https://www.notion.so/goldpiggy/TS-JS-Dev-Setup-ada0f7cdf74c424c8767ed692150cc88?pvs=4). You may need to request access, and you will only receive access if appropriate.

## Quick setup

- check out a dev branch
- copy .env from 1Password
- `npm install` to install dependencies
- `npm run dev` for local development and testing
- open `http://localhost:3000` in your browser
-
-
1 change: 1 addition & 0 deletions app/actions/apiInterface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ export interface JobParams {
vqaPageNumsFlag?: boolean;
vqaTableOnlyFlag?: boolean;
vqaChartOnlyFlag?: boolean;
vqaExtractInstruction?: Record<string, string>;
};
schemaInfo?: {
dbSchema?: string[];
Expand Down
41 changes: 41 additions & 0 deletions app/actions/runSyncExtractKeyValue.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import axios from 'axios';

/**
 * Arguments accepted by `runSyncExtractKeyValue`.
 */
interface IParams {
  /** Sent verbatim as the `Authorization` request header. */
  token: string;
  /** Base URL of the API; the `/extract_key_value` path is appended to it. */
  apiUrl: string;
  /** Document content, sent as `file_content` (presumably base64-encoded — confirm with callers). */
  base64String: string;
  /** Sent as `extract_args.extract_instruction` in the request body. */
  extractInstruction: Record<string, string>;
  /** Optional file type of the document (e.g. `'pdf'`). */
  fileType?: string;
}

/**
 * Posts a document to the `/extract_key_value` endpoint and returns the first
 * extraction result.
 *
 * The request body carries `file_content`, a fixed `file_type` of `'pdf'`, and
 * the caller's `extractInstruction` under `extract_args.extract_instruction`.
 *
 * @returns The first element of the `json` array in the response body.
 * @throws Error when the response status is not 200, or when the response body
 *   has no non-empty `json` array (previously this surfaced as an opaque
 *   TypeError or a silent `undefined`).
 */
export const runSyncExtractKeyValue = async ({
  token,
  apiUrl,
  base64String,
  extractInstruction,
}: IParams): Promise<string> => {
  const extractAPI = `${apiUrl}/extract_key_value`;
  const params = {
    file_content: base64String,
    // NOTE(review): `IParams.fileType` is declared but not forwarded here; the
    // value stays hard-coded until the set of file types the API accepts is confirmed.
    file_type: 'pdf',
    extract_args: {
      extract_instruction: extractInstruction,
    },
  };

  const config = {
    headers: {
      'x-api-key': '-',
      Authorization: token,
    },
  };

  const extractKeyValueResponse = await axios.post(extractAPI, params, config);

  if (extractKeyValueResponse.status !== 200) {
    throw new Error('Failed to extract key value pairs');
  }

  // Validate the payload shape before indexing into it so that a malformed or
  // empty response fails loudly instead of returning `undefined` as a string.
  const results: unknown = extractKeyValueResponse.data?.json;
  if (!Array.isArray(results) || results.length === 0) {
    throw new Error('Failed to extract key value pairs: response contained no results');
  }

  return results[0];
};
2 changes: 2 additions & 0 deletions app/actions/uploadFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ interface IParams {
vqaPageNumsFlag?: boolean;
vqaTableOnlyFlag?: boolean;
vqaChartOnlyFlag?: boolean;
extractInstruction?: Record<string, string>;
};
addFilesFormData: (data: PresignedResponse) => void;
}
Expand Down Expand Up @@ -60,6 +61,7 @@ export const uploadFile = async ({
vqa_page_nums_flag: extractArgs.vqaPageNumsFlag,
vqa_table_only_flag: extractArgs.vqaTableOnlyFlag,
vqa_table_only_caption_flag: extractArgs.vqaChartOnlyFlag,
extract_instruction: extractArgs.extractInstruction,
};

const requestBody = {
Expand Down
1 change: 1 addition & 0 deletions app/components/Button.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const Button = ({
disabled:opacity-70
disabled:cursor-not-allowed
rounded-xl
whitespace-nowrap
hover:bg-neutral-200
hover:text-cambio-gray
transition
Expand Down
8 changes: 5 additions & 3 deletions app/components/playground/ActionContainer.tsx
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import usePlaygroundStore from '@/app/hooks/usePlaygroundStore';
import LoginComponent from '../auth/Login';
import PlaygroundTab from './PlaygroundTab';
import ExtractContainer from './ExtractContainer';
import { useEffect, useState } from 'react';
import { PlaygroundFile, PlaygroundTabs } from '@/app/types/PlaygroundTypes';
import UploadButton from './UploadButton';
import MapContainer from './table/MapContainer';
import ExtractContainer from './ExtractContainer';
import ExtractKeyValuePairContainer from './ExtractKeyValuePairContainer';

const ActionContainer = () => {
const { loggedIn, selectedFileIndex, files } = usePlaygroundStore();
Expand All @@ -19,7 +20,7 @@ const ActionContainer = () => {

return (
<div className="w-full h-full min-h-[600px] grid grid-rows-[50px_1fr] overflow-hidden">
<div className={`w-full grid grid-cols-2`}>
<div className={`w-full grid grid-cols-3`}>
{Object.values(PlaygroundTabs).map((tab) => (
<PlaygroundTab key={tab} label={tab} />
))}
Expand All @@ -33,11 +34,12 @@ const ActionContainer = () => {
</div>
</div>
) : (
<div className="h-full border border-solid border-2 border-t-0 border-neutral-200 rounded-b-xl p-4 pt-0 overflow-hidden">
<div className="h-full border-solid border-2 border-t-0 border-neutral-200 rounded-b-xl p-4 pt-0 overflow-hidden">
{(selectedFile?.activeTab === PlaygroundTabs.PLAIN_TEXT || selectedFileIndex === null) && (
<ExtractContainer />
)}
{selectedFile?.activeTab === PlaygroundTabs.TABLE && <MapContainer />}
{selectedFile?.activeTab === PlaygroundTabs.KEY_VALUE_PAIR && <ExtractKeyValuePairContainer />}
</div>
)
) : (
Expand Down
223 changes: 223 additions & 0 deletions app/components/playground/ExtractKeyValuePairContainer.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
import { toast } from 'react-hot-toast';
import { useEffect, useMemo, useState } from 'react';
import { useProductionContext } from './ProductionContext';
import { ArrowLeft, Download } from '@phosphor-icons/react';
import { uploadFile } from '@/app/actions/uploadFile';
import { JobParams } from '@/app/actions/apiInterface';
import { runAsyncRequestJob } from '@/app/actions/runAsyncRequestJob';
import { PlaygroundFile, ExtractState, ProcessType, JobType } from '@/app/types/PlaygroundTypes';
import { runAsyncRequestJob as runPreprodAsyncRequestJob } from '@/app/actions/preprod/runAsyncRequestJob';
import Button from '../Button';
import CodeBlock from '../CodeBlock';
import DocumentViewer from '../DocumentViewer';
import KeyValueInputs from './KeyValueInputs';
import usePlaygroundStore from '@/app/hooks/usePlaygroundStore';
import ExtractKeyValuePairTutorial from '../tutorials/ExtractKeyValuePairTutorial';

/**
 * Triggers a browser download of `formattedData` as a JSON file.
 *
 * The download is named `<base>_extracted.json`, where `<base>` is the name of
 * `file` with its last extension removed when `file` is a `File`, and
 * `extracted_data` otherwise. Does nothing when `formattedData` is empty.
 *
 * @param formattedData - Text to place in the downloaded file.
 * @param file - Source file used only to derive the download name.
 */
const downloadExtractedData = (formattedData: string, file?: PlaygroundFile['file']) => {
  if (formattedData) {
    const sourceName = file instanceof File ? file.name : 'extracted_data';
    const stem = sourceName.replace(/\.[^/.]+$/, '');
    const objectUrl = URL.createObjectURL(new Blob([formattedData], { type: 'application/json' }));

    // A temporary anchor element is the vehicle for the download.
    const anchor = document.createElement('a');
    anchor.href = objectUrl;
    anchor.download = `${stem}_extracted.json`;

    document.body.appendChild(anchor);
    anchor.click();
    document.body.removeChild(anchor);

    // Release the blob URL once the click has been dispatched.
    URL.revokeObjectURL(objectUrl);
  }
};

/**
 * Playground panel for key-value extraction on the currently selected file.
 *
 * Left side: the tutorial, then either the document viewer or the extraction
 * result (as a JSON code block with "Back to File" / "Download" buttons).
 * Right side: the `KeyValueInputs` form whose submission uploads the file and
 * starts an asynchronous key-value extraction job.
 *
 * Per-file progress and results are kept in the playground store under
 * `extractKeyValueState` and `extractKeyValueResult`.
 */
const ExtractKeyValuePairContainer = () => {
  const [hideResult, setHideResult] = useState(false);
  // URL handed to the document viewer; kept in state so that object URLs can be
  // created and revoked symmetrically by the effect further below.
  const [fileUrl, setFileUrl] = useState('');
  const { apiURL, isProduction } = useProductionContext();
  const { selectedFileIndex, files, updateFileAtIndex, token, userId, clientId, addFilesFormData } =
    usePlaygroundStore();

  const selectedFile = useMemo(() => {
    if (selectedFileIndex !== null && files.length > 0) {
      return files[selectedFileIndex];
    }
  }, [selectedFileIndex, files]);

  // Show a persistent loading toast while the selected file is uploading or
  // extracting, and dismiss it in every other state.
  useEffect(() => {
    if (!selectedFile) return;

    if (
      selectedFile.extractKeyValueState === ExtractState.EXTRACTING ||
      selectedFile.extractKeyValueState === ExtractState.UPLOADING
    ) {
      toast.loading('Extracting data...', { id: 'key-value-extracting-toast' });
    } else {
      toast.dismiss('key-value-extracting-toast');
    }
  }, [selectedFile?.extractKeyValueState]);

  // Resolve the viewer URL for the selected file. String files are used as-is;
  // anything else gets an object URL that is revoked when the file changes or
  // the component unmounts, so blob URLs no longer accumulate.
  useEffect(() => {
    const currentFile = selectedFile?.file;
    if (!currentFile) {
      setFileUrl('');
      return;
    }
    if (typeof currentFile === 'string') {
      setFileUrl(currentFile);
      return;
    }

    const objectUrl = URL.createObjectURL(currentFile);
    setFileUrl(objectUrl);
    return () => {
      URL.revokeObjectURL(objectUrl);
    };
  }, [selectedFile?.file]);

  /**
   * Job success callback: stores the pretty-printed first `json` entry of the
   * response as the file's result and marks extraction as done.
   *
   * A response without `data`, without `json`, or with an undefined first
   * entry is reported as an undefined result and the file returns to READY.
   */
  const handleSuccess = async (response: any) => {
    // Optional chaining keeps a malformed response from throwing inside the
    // callback, which would leave the file stuck in EXTRACTING.
    const extracted = response?.data?.json?.[0];

    if (extracted === undefined) {
      toast.error(
        `${selectedFile?.file instanceof File ? selectedFile.file.name : 'File'}: Received undefined result. Please try again.`
      );
      updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
      return;
    }

    const formattedResult = JSON.stringify(extracted, null, 2);

    updateFileAtIndex(selectedFileIndex, 'extractKeyValueResult', formattedResult);
    updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.DONE_EXTRACTING);

    toast.success('Extraction complete!');
  };

  /**
   * Job error callback: a 429 response marks the file as LIMIT_REACHED; every
   * other failure returns it to READY. The error is always logged.
   */
  const handleError = (error: any) => {
    if (error.response) {
      if (error.response.status === 429) {
        toast.error('Extract limit reached.');
        updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.LIMIT_REACHED);
      } else {
        toast.error('Extraction failed. Please try again.');
        updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
      }
    } else {
      toast.error('Error during extraction. Please try again.');
      updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
    }
    console.error(error);
  };

  /** Job timeout callback: returns the file to READY and notifies the user. */
  const handleTimeout = () => {
    updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
    toast.error('Extract request timed out. Please try again.');
  };

  /**
   * Form submission handler: uploads the selected file, then starts the
   * key-value extraction job with the given instruction map.
   */
  const onSubmit = async (extractInstruction: Record<string, string>) => {
    if (!selectedFile?.file) {
      toast.error('Please select a file first');
      return;
    }

    if (selectedFileIndex === null) {
      toast.error('No file selected');
      return;
    }

    try {
      updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.UPLOADING);
      const file = selectedFile.file;

      const jobParams: JobParams = {
        vqaProcessorArgs: {
          vqaExtractInstruction: extractInstruction,
        },
      };

      // Upload file and get presigned url and metadata
      // NOTE(review): `file` may also be a URL string (see the viewer URL effect);
      // confirm `uploadFile` copes with that before relying on this assertion.
      const uploadResult = await uploadFile({
        api_url: apiURL,
        userId,
        token,
        file: file as File,
        process_type: ProcessType.EXTRACT_KEY_VALUE,
        extractArgs: {
          extractInstruction,
        },
        addFilesFormData,
      });

      if (uploadResult instanceof Error) {
        toast.error('Error uploading file. Please try again.');
        updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
        return;
      }

      const fileData = uploadResult.data;
      updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.EXTRACTING);

      // Common job parameters
      const jobConfig = {
        apiURL,
        jobType: JobType.KEY_VALUE_EXTRACTION,
        userId,
        clientId,
        fileId: fileData.fileId,
        fileData,
        selectedFile,
        token,
        sourceType: 's3',
        jobParams,
        selectedFileIndex,
        filename: file instanceof File ? file.name : 'file',
        handleError,
        handleSuccess,
        handleTimeout,
        updateFileAtIndex,
      } as const;

      // Run the async job based on environment
      const runJob = isProduction ? runAsyncRequestJob : runPreprodAsyncRequestJob;
      await runJob(jobConfig);
    } catch (error) {
      toast.error('Extraction failed. Please try again.');
      console.error(error);
      updateFileAtIndex(selectedFileIndex, 'extractKeyValueState', ExtractState.READY);
    }
  };

  return (
    <div className="h-full w-full pt-4 relative">
      <div className="w-[calc(90%-11rem)] h-full overflow-auto overscroll-contain">
        <ExtractKeyValuePairTutorial />
        {fileUrl && (hideResult || !selectedFile?.extractKeyValueResult) && (
          <div>
            <DocumentViewer
              fileType={selectedFile?.file instanceof File ? selectedFile.file.type : 'pdf'}
              fileUrl={fileUrl}
            />
            {selectedFile?.extractKeyValueResult && (
              <div className="absolute bottom-4 left-4">
                <Button label="Back to Result" labelIcon={ArrowLeft} onClick={() => setHideResult(false)} />
              </div>
            )}
          </div>
        )}
        {!hideResult && selectedFile?.extractKeyValueResult && (
          <div className="pb-24">
            <CodeBlock language="json" code={selectedFile?.extractKeyValueResult} aria-label="Extraction Result" />
            <div className="absolute bottom-4 left-4 flex gap-2 w-fit">
              <Button label="Back to File" labelIcon={ArrowLeft} onClick={() => setHideResult(true)} />
              <Button
                label="Download"
                labelIcon={Download}
                onClick={() => downloadExtractedData(selectedFile?.extractKeyValueResult, selectedFile?.file)}
              />
            </div>
          </div>
        )}
      </div>
      <div className="h-[calc(100%-1rem)] min-w-60 max-w-72 w-[18vw] p-4 rounded-2xl shadow-[0px_0px_4px_2px_rgba(0,_0,_0,_0.1)] absolute top-4 right-0">
        <div className="w-full max-h-full overflow-hidden flex flex-col gap-4">
          <div className="flex flex-col gap-2">
            <KeyValueInputs
              onSubmit={onSubmit}
              isLoading={
                selectedFile?.extractKeyValueState === ExtractState.EXTRACTING ||
                selectedFile?.extractKeyValueState === ExtractState.UPLOADING
              }
            />
          </div>
        </div>
      </div>
    </div>
  );
};

export default ExtractKeyValuePairContainer;
Loading

0 comments on commit eb678e7

Please sign in to comment.