Skip to content

Commit

Permalink
Extract tables from PDF to CSV using Tabula (#2312)
Browse files Browse the repository at this point in the history
* Add Tabula dependency and exclude slf4j-simple

- Add tabula-java dependency to extract tables into CSV.
- Exclude slf4j-simple due to Logback

* Add a flexible CSVWriter

- Add FlexibleCSVWriter which extends CSVWriter to pass a custom CSVFormat, as CSVWriter's parameterized constructor (that allows changing CSVFormat) is protected.

* Use Tabula in extracting tables from PDF

- Use Tabula in extracting tables from PDF instead of the existing implementation

* Delete PDFTableStripper as It is unneeded

- Delete PDFTableStripper as It is unneeded as Tabula-Java is used instead.

* Use correct class in ExtractCSVController logger

* Exclude gson and bcprov-jdk15on dependencies from tabula

- Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities.
  • Loading branch information
omar-ahmed42 authored Nov 23, 2024
1 parent faa8a97 commit afad06b
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 419 deletions.
7 changes: 7 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ dependencies {
exclude group: "commons-logging", module: "commons-logging"
}

// https://mvnrepository.com/artifact/technology.tabula/tabula
implementation ('technology.tabula:tabula:1.0.5') {
exclude group: "org.slf4j", module: "slf4j-simple"
exclude group: "org.bouncycastle", module: "bcprov-jdk15on"
exclude group: "com.google.code.gson", module: "gson"
}

implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'

implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package stirling.software.SPDF.controller.api.converters;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ContentDisposition;
Expand All @@ -18,79 +18,36 @@
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import com.opencsv.CSVWriter;

import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;

import stirling.software.SPDF.controller.api.CropController;
import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
import stirling.software.SPDF.model.api.extract.PDFFilePage;
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.Table;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.writers.Writer;

@RestController
@RequestMapping("/api/v1/convert")
@Tag(name = "Convert", description = "Convert APIs")
public class ExtractCSVController {

private static final Logger logger = LoggerFactory.getLogger(CropController.class);
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);

@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
@Operation(
summary = "Extracts a CSV document from a PDF",
description =
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {

ArrayList<String> tableData = new ArrayList<>();
int columnsCount = 0;

try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
final double res = 72; // PDF units are at 72 DPI
PDFTableStripper stripper = new PDFTableStripper();
PDPage pdPage = document.getPage(form.getPageId() - 1);
stripper.extractTable(pdPage);
columnsCount = stripper.getColumns();
for (int c = 0; c < columnsCount; ++c) {
for (int r = 0; r < stripper.getRows(); ++r) {
tableData.add(stripper.getText(r, c));
}
}
}

ArrayList<String> notEmptyColumns = new ArrayList<>();

for (String item : tableData) {
if (!item.trim().isEmpty()) {
notEmptyColumns.add(item);
} else {
columnsCount--;
}
}

List<String> fullTable =
notEmptyColumns.stream()
.map(
(entity) ->
entity.replace('\n', ' ')
.replace('\r', ' ')
.trim()
.replaceAll("\\s{2,}", "|"))
.toList();

int rowsCount = fullTable.get(0).split("\\|").length;

ArrayList<String> headersList = getTableHeaders(columnsCount, fullTable);
ArrayList<String> recordList = getRecordsList(rowsCount, fullTable);

if (headersList.size() == 0 && recordList.size() == 0) {
throw new Exception("No table detected, no headers or records found");
}

StringWriter writer = new StringWriter();
try (CSVWriter csvWriter = new CSVWriter(writer)) {
csvWriter.writeNext(headersList.toArray(new String[0]));
for (String record : recordList) {
csvWriter.writeNext(record.split("\\|"));
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
Writer csvWriter = new FlexibleCSVWriter(format);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
Page page = extractor.extract(form.getPageId());
List<Table> tables = sea.extract(page);
csvWriter.write(writer, tables);
}
}

Expand All @@ -99,41 +56,12 @@ public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws
ContentDisposition.builder("attachment")
.filename(
form.getFileInput()
.getOriginalFilename()
.replaceFirst("[.][^.]+$", "")
.getOriginalFilename()
.replaceFirst("[.][^.]+$", "")
+ "_extracted.csv")
.build());
headers.setContentType(MediaType.parseMediaType("text/csv"));

return ResponseEntity.ok().headers(headers).body(writer.toString());
}

private ArrayList<String> getRecordsList(int rowsCounts, List<String> items) {
ArrayList<String> recordsList = new ArrayList<>();

for (int b = 1; b < rowsCounts; b++) {
StringBuilder strbldr = new StringBuilder();

for (int i = 0; i < items.size(); i++) {
String[] parts = items.get(i).split("\\|");
strbldr.append(parts[b]);
if (i != items.size() - 1) {
strbldr.append("|");
}
}
recordsList.add(strbldr.toString());
}

return recordsList;
}

private ArrayList<String> getTableHeaders(int columnsCount, List<String> items) {
ArrayList<String> resultList = new ArrayList<>();
for (int i = 0; i < columnsCount; i++) {
String[] parts = items.get(i).split("\\|");
resultList.add(parts[0]);
}

return resultList;
}
}
Loading

0 comments on commit afad06b

Please sign in to comment.