From 71536ff1eeaadbf746e452effdc280b46505c777 Mon Sep 17 00:00:00 2001 From: psainics Date: Mon, 4 Mar 2024 04:16:49 +0530 Subject: [PATCH] Streaming ? --- .../io/cdap/plugin/batch/source/ExcelInputFormat.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java index b22ae8369..1b755539d 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java @@ -16,6 +16,7 @@ package io.cdap.plugin.batch.source; +import com.github.pjfanning.xlsx.StreamingReader; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import org.apache.hadoop.conf.Configuration; @@ -146,8 +147,10 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro String sheetValue = job.get(SHEET_VALUE); Sheet workSheet; // sheet can be used as common for XSSF and HSSF workbook + // match regex pattern *.xls or *.xlsx try { - Workbook workbook = WorkbookFactory.create(fileIn); + // Workbook workbook = WorkbookFactory.create(fileIn); + Workbook workbook = StreamingReader.builder().rowCacheSize(10).open(fileIn); if (sheet.equalsIgnoreCase(SHEET_NAME)) { workSheet = workbook.getSheet(sheetValue); } else { @@ -157,7 +160,8 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro throw new IllegalArgumentException("Exception while reading excel sheet. " + e.getMessage(), e); } - rowCount = job.getInt(ROWS_LIMIT, workSheet.getPhysicalNumberOfRows()); +// rowCount = job.getInt(ROWS_LIMIT, workSheet.getPhysicalNumberOfRows()); + rowCount = job.getInt(ROWS_LIMIT, 10000); rows = workSheet.iterator(); lastRowNum = workSheet.getLastRowNum(); rowIdx = 0;