From 197e9a3c948e10265187f1505fd6271b807caaac Mon Sep 17 00:00:00 2001
From: ChengJie1053 <18033291053@163.com>
Date: Wed, 6 Dec 2023 17:13:46 +0800
Subject: [PATCH 01/16] Support for storing result sets in Parquet format
---
linkis-commons/linkis-storage/pom.xml | 30 +++++
.../storage/conf/LinkisStorageConf.java | 13 +-
.../resultset/DefaultResultSetFactory.java | 22 +++-
.../resultset/ParquetResultSetReader.java | 122 ++++++++++++++++++
.../resultset/ParquetResultSetWriter.java | 95 ++++++++++++++
.../resultset/ResultSetReaderFactory.java | 20 ++-
.../resultset/ResultSetWriterFactory.java | 19 ++-
.../storage/resultset/StorageResultSet.java | 12 +-
.../resultset/StorageResultSetReader.java | 8 +-
.../resultset/StorageResultSetWriter.java | 24 ++--
.../linkis/storage/source/FileSource.java | 13 +-
.../linkis/storage/utils/StorageHelper.java | 5 +-
.../filesystem/restful/api/FsRestfulApi.java | 4 +
13 files changed, 351 insertions(+), 36 deletions(-)
create mode 100644 linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
create mode 100644 linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetWriter.java
diff --git a/linkis-commons/linkis-storage/pom.xml b/linkis-commons/linkis-storage/pom.xml
index def795ebd8..c9c610bcbd 100644
--- a/linkis-commons/linkis-storage/pom.xml
+++ b/linkis-commons/linkis-storage/pom.xml
@@ -99,6 +99,36 @@
aws-java-sdk-s3
1.12.261
+
+
+ org.apache.parquet
+ parquet-avro
+ 1.12.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-core
+ ${hadoop.version}
+
+
+ log4j
+ log4j
+
+
+ org.slf4j
+ slf4j-log4j12
+
+
+
+ ch.qos.reload4j
+ reload4j
+
+
+ org.slf4j
+ slf4j-reload4j
+
+
+
diff --git a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/conf/LinkisStorageConf.java b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/conf/LinkisStorageConf.java
index 74950c15fe..916f9ff469 100644
--- a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/conf/LinkisStorageConf.java
+++ b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/conf/LinkisStorageConf.java
@@ -25,6 +25,12 @@
public class LinkisStorageConf {
private static final Object CONF_LOCK = new Object();
+ public static final String DOLPHIN = "dolphin";
+
+ public static final String PARQUET = "parquet";
+
+ public static final String PARQUET_FILE_SUFFIX = ".parquet";
+
public static final String HDFS_FILE_SYSTEM_REST_ERRS =
CommonVars.apply(
"wds.linkis.hdfs.rest.errs",
@@ -34,12 +40,17 @@ public class LinkisStorageConf {
public static final String ROW_BYTE_MAX_LEN_STR =
CommonVars.apply("wds.linkis.resultset.row.max.str", "2m").getValue();
+ public static final String ENGINE_RESULT_TYPE =
+ CommonVars.apply("wds.linkis.engine.resultSet.type", DOLPHIN, "Result type").getValue();
+
public static final long ROW_BYTE_MAX_LEN = ByteTimeUtils.byteStringAsBytes(ROW_BYTE_MAX_LEN_STR);
public static final String FILE_TYPE =
CommonVars.apply(
"wds.linkis.storage.file.type",
- "dolphin,sql,scala,py,hql,python,out,log,text,txt,sh,jdbc,ngql,psql,fql,tsql")
+ "dolphin,sql,scala,py,hql,python,out,log,text,txt,sh,jdbc,ngql,psql,fql,tsql"
+ + ","
+ + PARQUET)
.getValue();
private static volatile String[] fileTypeArr = null;
diff --git a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/DefaultResultSetFactory.java b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/DefaultResultSetFactory.java
index db78afac29..9ac4c02cc7 100644
--- a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/DefaultResultSetFactory.java
+++ b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/DefaultResultSetFactory.java
@@ -23,6 +23,7 @@
import org.apache.linkis.common.io.Record;
import org.apache.linkis.common.io.resultset.ResultSet;
import org.apache.linkis.storage.FSFactory;
+import org.apache.linkis.storage.conf.LinkisStorageConf;
import org.apache.linkis.storage.domain.Dolphin;
import org.apache.linkis.storage.errorcode.LinkisStorageErrorCodeSummary;
import org.apache.linkis.storage.exception.StorageWarnException;
@@ -134,15 +135,22 @@ public String[] getResultSetType() {
@Override
public ResultSet extends MetaData, ? extends Record> getResultSetByPath(FsPath fsPath, Fs fs) {
+ ResultSet resultSet = null;
try (InputStream inputStream = fs.read(fsPath)) {
- String resultSetType = Dolphin.getType(inputStream);
- if (StringUtils.isEmpty(resultSetType)) {
- throw new StorageWarnException(
- THE_FILE_IS_EMPTY.getErrorCode(),
- MessageFormat.format(THE_FILE_IS_EMPTY.getErrorDesc(), fsPath.getPath()));
+ String engineResultType = LinkisStorageConf.ENGINE_RESULT_TYPE;
+ if (engineResultType.equals(LinkisStorageConf.DOLPHIN)) {
+ String resultSetType = Dolphin.getType(inputStream);
+ if (StringUtils.isEmpty(resultSetType)) {
+ throw new StorageWarnException(
+ THE_FILE_IS_EMPTY.getErrorCode(),
+ MessageFormat.format(THE_FILE_IS_EMPTY.getErrorDesc(), fsPath.getPath()));
+ }
+ // Utils.tryQuietly(fs::close);
+ resultSet = getResultSetByType(resultSetType);
+ } else if (engineResultType.equals(LinkisStorageConf.PARQUET)) {
+ resultSet = getResultSetByType(ResultSetFactory.TABLE_TYPE);
}
- // Utils.tryQuietly(fs::close);
- return getResultSetByType(resultSetType);
+ return resultSet;
} catch (IOException e) {
throw new RuntimeException(e);
}
diff --git a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
new file mode 100644
index 0000000000..07c6c4cd7a
--- /dev/null
+++ b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.linkis.storage.resultset;
+
+import org.apache.linkis.common.io.FsPath;
+import org.apache.linkis.common.io.MetaData;
+import org.apache.linkis.common.io.Record;
+import org.apache.linkis.common.io.resultset.ResultSet;
+import org.apache.linkis.storage.domain.Column;
+import org.apache.linkis.storage.domain.DataType;
+import org.apache.linkis.storage.resultset.table.TableMetaData;
+import org.apache.linkis.storage.resultset.table.TableRecord;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.hadoop.ParquetReader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ParquetResultSetReader
+ extends StorageResultSetReader {
+
+ private static final Logger logger = LoggerFactory.getLogger(ParquetResultSetReader.class);
+
+ private FsPath fsPath;
+
+ private ParquetReader parquetReader;
+
+ private GenericRecord record;
+
+ public ParquetResultSetReader(ResultSet resultSet, InputStream inputStream, FsPath fsPath)
+ throws IOException {
+ super(resultSet, inputStream);
+ this.fsPath = fsPath;
+ this.parquetReader =
+ AvroParquetReader.builder(new Path(fsPath.getPath())).build();
+ this.record = parquetReader.read();
+ }
+
+ @Override
+ public MetaData getMetaData() {
+ if (metaData == null) {
+ try {
+ List fields = record.getSchema().getFields();
+ List columnList =
+ fields.stream()
+ .map(
+ field ->
+ new Column(
+ field.name(),
+ DataType.toDataType(field.schema().getType().getName()),
+ ""))
+ .collect(Collectors.toList());
+
+ metaData = new TableMetaData(columnList.toArray(new Column[0]));
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to read parquet schema", e);
+ }
+ }
+ return metaData;
+ }
+
+ @Override
+ public boolean hasNext() throws IOException {
+ if (metaData == null) getMetaData();
+ if (record == null) return false;
+ ArrayList
-
-
- storage-parquet
-
- compile
-
-
-
-
- storage-orc
-
- compile
-
-
From 8d5c5ac5552cfe3ecd5ed6f43979180e47713e42 Mon Sep 17 00:00:00 2001
From: ChengJie1053 <18033291053@163.com>
Date: Fri, 19 Jan 2024 10:27:06 +0800
Subject: [PATCH 14/16] maven packaging optimization
---
pom.xml | 1 -
1 file changed, 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index a6792badf0..f8476db3d7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -121,7 +121,6 @@
${hadoop.version}
provided
-
1.16.2
0.9.3
1.3.0
From 5c4c20fdf4ec92c132f237683fd80ffd3646be03 Mon Sep 17 00:00:00 2001
From: ChengJie1053 <18033291053@163.com>
Date: Mon, 22 Jan 2024 15:33:05 +0800
Subject: [PATCH 15/16] Override skip method
---
.../linkis/storage/resultset/OrcResultSetReader.java | 11 ++++++++++-
.../storage/resultset/ParquetResultSetReader.java | 11 ++++++++++-
2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/OrcResultSetReader.java b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/OrcResultSetReader.java
index 6df61c7998..249e326cde 100644
--- a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/OrcResultSetReader.java
+++ b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/OrcResultSetReader.java
@@ -101,7 +101,28 @@ public MetaData getMetaData() {
+  /**
+   * Skips up to {@code recordNum} rows by advancing the reader with {@link #hasNext()}.
+   *
+   * @param recordNum number of rows the caller wants to skip
+   * @return the number of rows actually skipped, or {@code -1} if {@code recordNum} is negative
+   * @throws IOException not raised by this implementation (failures are absorbed into the
+   *     returned count); kept by the method signature
+   */
   @Override
   public int skip(int recordNum) throws IOException {
-    throw new UnsupportedOperationException("Storeage Unsupported type: skip");
+    if (recordNum < 0) return -1;
+
+    int skipped = 0;
+    try {
+      // NOTE(review): assumes hasNext() consumes one row per call and returns false once the
+      // data is exhausted - confirm. Stopping at the first false result keeps the returned
+      // count from exceeding the number of rows that really exist.
+      while (skipped < recordNum && hasNext()) {
+        skipped++;
+      }
+    } catch (IOException | RuntimeException ignored) {
+      // Best effort: a failure ends the skip and only the rows consumed so far are reported.
+    }
+    return skipped;
   }
@Override
diff --git a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
index 9774ba1dba..c09804294d 100644
--- a/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
+++ b/linkis-commons/linkis-storage/src/main/java/org/apache/linkis/storage/resultset/ParquetResultSetReader.java
@@ -98,7 +98,31 @@ public MetaData getMetaData() {
+  /**
+   * Skips up to {@code recordNum} rows.
+   *
+   * <p>A row is skipped by letting the next read overwrite the record currently buffered in
+   * {@code record}; skipping stops early once that record is {@code null}.
+   *
+   * @param recordNum number of rows the caller wants to skip
+   * @return the number of rows actually skipped, or {@code -1} if {@code recordNum} is negative
+   * @throws IOException not raised by this implementation (read failures are absorbed into the
+   *     returned count); kept by the method signature
+   */
   @Override
   public int skip(int recordNum) throws IOException {
-    throw new UnsupportedOperationException("Storeage Unsupported type: skip");
+    if (recordNum < 0) return -1;
+
+    int skipped = 0;
+    try {
+      // NOTE(review): assumes a null record means the reader is exhausted - confirm. Checking
+      // it first avoids reading past the end and counting rows that were never there.
+      while (skipped < recordNum && record != null) {
+        this.record = parquetReader.read();
+        skipped++;
+      }
+    } catch (IOException | RuntimeException ignored) {
+      // Best effort: a failed read ends the skip and only the rows consumed so far are reported.
+    }
+    return skipped;
   }
@Override
From c718a8d3e7af359e3a98abb74124252db19b3b32 Mon Sep 17 00:00:00 2001
From: ChengJie1053 <18033291053@163.com>
Date: Mon, 22 Jan 2024 18:53:35 +0800
Subject: [PATCH 16/16] maven packaging optimization
---
linkis-commons/linkis-storage/pom.xml | 3 +++
pom.xml | 17 +++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/linkis-commons/linkis-storage/pom.xml b/linkis-commons/linkis-storage/pom.xml
index 93628db808..2f1cdb44bd 100644
--- a/linkis-commons/linkis-storage/pom.xml
+++ b/linkis-commons/linkis-storage/pom.xml
@@ -104,11 +104,13 @@
org.apache.parquet
parquet-avro
${parquet-avro.version}
+ ${storage.parquet.scope}
org.apache.hadoop
hadoop-mapreduce-client-core
${hadoop.version}
+ ${storage.parquet.scope}
log4j
@@ -134,6 +136,7 @@
orc-core
${orc-core.version}
nohive
+ ${storage.orc.scope}
org.apache.hive
diff --git a/pom.xml b/pom.xml
index f8476db3d7..d27ef8f9a8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -121,6 +121,9 @@
${hadoop.version}
provided
+ provided
+ provided
+
1.16.2
0.9.3
1.3.0
@@ -1944,5 +1947,19 @@
+
+
+ storage-parquet
+
+ compile
+
+
+
+
+ storage-orc
+
+ compile
+
+