Skip to content

Commit

Permalink
MAGIC !
Browse files Browse the repository at this point in the history
  • Loading branch information
psainics committed Mar 3, 2024
1 parent 71536ff commit fef09cf
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 7 deletions.
12 changes: 12 additions & 0 deletions core-plugins/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,18 @@
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>excel-streaming-reader</artifactId>
<scope>compile</scope>
<version>4.2.1</version>
</dependency>
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>poi-shared-strings</artifactId>
<scope>compile</scope>
<version>2.8.0</version>
</dependency>
<dependency>
<scope>test</scope>
<groupId>org.apache.sshd</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.poi.EmptyFileException;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
Expand All @@ -40,6 +42,7 @@
import org.apache.poi.ss.util.CellReference;

import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;


Expand Down Expand Up @@ -146,11 +149,31 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
String sheet = job.get(SHEET);
String sheetValue = job.get(SHEET_VALUE);

Sheet workSheet; // sheet can be used as common for XSSF and HSSF workbook
// match regex pattern *.xls or *.xlsx
Sheet workSheet;
Workbook workbook;
boolean isStreaming = false;
try {
// Workbook workbook = WorkbookFactory.create(fileIn);
Workbook workbook = StreamingReader.builder().rowCacheSize(10).open(fileIn);
// Use Magic Bytes to detect the file type
InputStream is = FileMagic.prepareToCheckMagic(fileIn);
byte[] emptyFileCheck = new byte[1];
is.mark(emptyFileCheck.length);
if (is.read(emptyFileCheck) < emptyFileCheck.length) {
throw new EmptyFileException();
}
is.reset();

final FileMagic fm = FileMagic.valueOf(is);
switch (fm) {
case OOXML:
workbook = StreamingReader.builder().rowCacheSize(10).open(fileIn);
isStreaming = true;
break;
case OLE2:
workbook = WorkbookFactory.create(fileIn);
break;
default:
throw new IOException("Can't open workbook - unsupported file type: " + fm);
}
if (sheet.equalsIgnoreCase(SHEET_NAME)) {
workSheet = workbook.getSheet(sheetValue);
} else {
Expand All @@ -160,8 +183,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
throw new IllegalArgumentException("Exception while reading excel sheet. " + e.getMessage(), e);
}

// rowCount = job.getInt(ROWS_LIMIT, workSheet.getPhysicalNumberOfRows());
rowCount = job.getInt(ROWS_LIMIT, 10000);
// The number of rows in a sheet cannot be determined while streaming,
// so -1 is used as rowCount to indicate that all rows should be read.
rowCount = job.getInt(ROWS_LIMIT, isStreaming ? -1 : workSheet.getPhysicalNumberOfRows());
rows = workSheet.iterator();
lastRowNum = workSheet.getLastRowNum();
rowIdx = 0;
Expand All @@ -175,7 +199,7 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
}

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
public boolean nextKeyValue() {
if (!rows.hasNext() || rowCount == 0) {
return false;
}
Expand Down

0 comments on commit fef09cf

Please sign in to comment.