From 85956571f50e4bf6aa2cf6c390b67113d5732ce4 Mon Sep 17 00:00:00 2001 From: rahulbhatia023 Date: Tue, 12 Dec 2023 16:47:51 -0500 Subject: [PATCH 1/2] Modifying type class hierarchy --- .../aws/s3/DataFrameToS3BucketWriter.scala | 14 +++++++----- .../instances/DataFrameToCSVFileWriter.scala | 3 ++- .../instances/DataFrameToJSONFileWriter.scala | 3 ++- .../DataFrameToParquetFileWriter.scala | 3 ++- .../instances/DataFrameToS3FileWriter.scala | 13 ----------- .../instances/DataFrameToXMLFileWriter.scala | 3 ++- .../s3/DataFrameToCSVS3BucketWriterSpec.scala | 6 +++-- .../DataFrameToJSONS3BucketWriterSpec.scala | 6 +++-- ...DataFrameToParquetS3BucketWriterSpec.scala | 6 +++-- .../s3/DataFrameToXMLS3BucketWriterSpec.scala | 6 +++-- .../bigquery/DataFrameToBigQueryWriter.scala | 22 +++++++------------ .../instances/DataFrameToBQWriter.scala | 15 ------------- .../instances/DataFrameToDirectBQWriter.scala | 3 ++- .../DataFrameToIndirectBQWriter.scala | 3 ++- .../gcp/gcs/DataFrameToGCSBucketWriter.scala | 14 +++++++----- .../instances/DataFrameToCSVFileWriter.scala | 3 ++- .../instances/DataFrameToGCSFileWriter.scala | 13 ----------- .../instances/DataFrameToJSONFileWriter.scala | 3 ++- .../DataFrameToParquetFileWriter.scala | 3 ++- .../instances/DataFrameToXMLFileWriter.scala | 3 ++- .../DataFrameToLocalFileSystemWriter.scala | 14 +++++++----- .../instances/DataFrameToCSVFileWriter.scala | 3 ++- .../instances/DataFrameToFileWriter.scala | 7 ------ .../instances/DataFrameToJSONFileWriter.scala | 3 ++- .../DataFrameToParquetFileWriter.scala | 3 ++- .../instances/DataFrameToXMLFileWriter.scala | 3 ++- ...aFrameToCSVLocalFileSystemWriterSpec.scala | 22 ++++++++++--------- ...FrameToJSONLocalFileSystemWriterSpec.scala | 6 +++-- ...meToParquetLocalFileSystemWriterSpec.scala | 4 +++- ...aFrameToXMLLocalFileSystemWriterSpec.scala | 14 +++++++----- 30 files changed, 103 insertions(+), 121 deletions(-) delete mode 100644 aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToS3FileWriter.scala delete mode 100644 gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToBQWriter.scala delete mode 100644 gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToGCSFileWriter.scala delete mode 100644 local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToFileWriter.scala diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToS3BucketWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToS3BucketWriter.scala index 28597e7..72de4ac 100644 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToS3BucketWriter.scala +++ b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToS3BucketWriter.scala @@ -1,18 +1,20 @@ package com.clairvoyant.data.scalaxy.writer.aws.s3 import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.FileFormat -import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToS3FileWriter import org.apache.spark.sql.{DataFrame, SaveMode} -object DataFrameToS3BucketWriter { +trait DataFrameToS3BucketWriter[T]: - def write[T <: FileFormat]( + def write( dataFrame: DataFrame, fileFormat: T, bucketName: String, path: String, saveMode: SaveMode = SaveMode.Overwrite - )(using dataFrameToS3FileWriter: DataFrameToS3FileWriter[T]): Unit = - dataFrameToS3FileWriter.write(dataFrame, fileFormat, bucketName, path, saveMode) + ): Unit -} +object 
DataFrameToS3BucketWriter: + + def apply[T <: FileFormat]( + using dataFrameToS3BucketWriter: DataFrameToS3BucketWriter[T] + ): DataFrameToS3BucketWriter[T] = dataFrameToS3BucketWriter diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToCSVFileWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToCSVFileWriter.scala index cce7128..2261445 100644 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToCSVFileWriter.scala +++ b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToCSVFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.aws.s3.instances +import com.clairvoyant.data.scalaxy.writer.aws.s3.DataFrameToS3BucketWriter import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.CSVFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToCSVFileWriter extends DataFrameToS3FileWriter[CSVFileFormat] { +implicit object DataFrameToCSVFileWriter extends DataFrameToS3BucketWriter[CSVFileFormat] { import org.apache.spark.sql.catalyst.csv.CSVOptions.* diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToJSONFileWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToJSONFileWriter.scala index f733fc7..5a10dd1 100644 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToJSONFileWriter.scala +++ b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToJSONFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.aws.s3.instances +import com.clairvoyant.data.scalaxy.writer.aws.s3.DataFrameToS3BucketWriter import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.JSONFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToJSONFileWriter extends DataFrameToS3FileWriter[JSONFileFormat] { +implicit object DataFrameToJSONFileWriter extends DataFrameToS3BucketWriter[JSONFileFormat] { import org.apache.spark.sql.catalyst.json.JSONOptions.* diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToParquetFileWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToParquetFileWriter.scala index 6629c89..cc95bb9 100644 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToParquetFileWriter.scala +++ b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToParquetFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.aws.s3.instances +import com.clairvoyant.data.scalaxy.writer.aws.s3.DataFrameToS3BucketWriter import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.ParquetFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToParquetFileWriter extends DataFrameToS3FileWriter[ParquetFileFormat] { +implicit object DataFrameToParquetFileWriter extends DataFrameToS3BucketWriter[ParquetFileFormat] { override def write( dataFrame: DataFrame, diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToS3FileWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToS3FileWriter.scala deleted file mode 100644 index 5c83e7f..0000000 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToS3FileWriter.scala +++ 
/dev/null @@ -1,13 +0,0 @@ -package com.clairvoyant.data.scalaxy.writer.aws.s3.instances - -import org.apache.spark.sql.{DataFrame, SaveMode} - -trait DataFrameToS3FileWriter[T]: - - def write( - dataFrame: DataFrame, - fileFormat: T, - bucketName: String, - path: String, - saveMode: SaveMode - ): Unit diff --git a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToXMLFileWriter.scala b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToXMLFileWriter.scala index e4edae0..873de00 100644 --- a/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToXMLFileWriter.scala +++ b/aws/src/main/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/instances/DataFrameToXMLFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.aws.s3.instances +import com.clairvoyant.data.scalaxy.writer.aws.s3.DataFrameToS3BucketWriter import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.XMLFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToXMLFileWriter extends DataFrameToS3FileWriter[XMLFileFormat] { +implicit object DataFrameToXMLFileWriter extends DataFrameToS3BucketWriter[XMLFileFormat] { import com.databricks.spark.xml.* diff --git a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToCSVS3BucketWriterSpec.scala b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToCSVS3BucketWriterSpec.scala index 71ea671..e9ab0d4 100644 --- a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToCSVS3BucketWriterSpec.scala +++ b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToCSVS3BucketWriterSpec.scala @@ -10,6 +10,8 @@ class DataFrameToCSVS3BucketWriterSpec extends DataFrameReader with DataFrameMat val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToS3BucketWriter = DataFrameToS3BucketWriter[CSVFileFormat] + "write()" should "write a dataframe to the provided s3 path" in { val df = readJSONFromText( """|{ @@ -25,8 +27,8 @@ class DataFrameToCSVS3BucketWriterSpec extends DataFrameReader with DataFrameMat s3Client.createBucket(bucketName) - DataFrameToS3BucketWriter - .write[CSVFileFormat]( + dataFrameToS3BucketWriter + .write( dataFrame = df, fileFormat = csvFileFormat, bucketName = bucketName, diff --git a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToJSONS3BucketWriterSpec.scala b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToJSONS3BucketWriterSpec.scala index e8bca17..ffb11c7 100644 --- a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToJSONS3BucketWriterSpec.scala +++ b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToJSONS3BucketWriterSpec.scala @@ -10,6 +10,8 @@ class DataFrameToJSONS3BucketWriterSpec extends DataFrameReader with DataFrameMa val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToS3BucketWriter = DataFrameToS3BucketWriter[JSONFileFormat] + "write()" should "write a dataframe to the provided s3 path" in { val df = readJSONFromText( """|{ @@ -25,8 +27,8 @@ class DataFrameToJSONS3BucketWriterSpec extends DataFrameReader with DataFrameMa s3Client.createBucket(bucketName) - DataFrameToS3BucketWriter - .write[JSONFileFormat]( + dataFrameToS3BucketWriter + .write( dataFrame = df, fileFormat = jsonFileFormat, bucketName = bucketName, diff --git 
a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToParquetS3BucketWriterSpec.scala b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToParquetS3BucketWriterSpec.scala index 417d6fb..ffe2c80 100644 --- a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToParquetS3BucketWriterSpec.scala +++ b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToParquetS3BucketWriterSpec.scala @@ -10,6 +10,8 @@ class DataFrameToParquetS3BucketWriterSpec extends DataFrameReader with DataFram val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToS3BucketWriter = DataFrameToS3BucketWriter[ParquetFileFormat] + "write()" should "write a dataframe to the provided s3 path" in { val df = readJSONFromText( """|{ @@ -25,8 +27,8 @@ class DataFrameToParquetS3BucketWriterSpec extends DataFrameReader with DataFram s3Client.createBucket(bucketName) - DataFrameToS3BucketWriter - .write[ParquetFileFormat]( + dataFrameToS3BucketWriter + .write( dataFrame = df, fileFormat = parquetFileFormat, bucketName = bucketName, diff --git a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToXMLS3BucketWriterSpec.scala b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToXMLS3BucketWriterSpec.scala index 1406e58..620b1e6 100644 --- a/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToXMLS3BucketWriterSpec.scala +++ b/aws/src/test/scala/com/clairvoyant/data/scalaxy/writer/aws/s3/DataFrameToXMLS3BucketWriterSpec.scala @@ -10,6 +10,8 @@ class DataFrameToXMLS3BucketWriterSpec extends DataFrameReader with DataFrameMat val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToS3BucketWriter = DataFrameToS3BucketWriter[XMLFileFormat] + "write()" should "write a dataframe to the provided s3 path" in { val df = readJSONFromText( """|{ @@ -25,8 +27,8 @@ class DataFrameToXMLS3BucketWriterSpec extends DataFrameReader with DataFrameMat s3Client.createBucket(bucketName) - DataFrameToS3BucketWriter - .write[XMLFileFormat]( + dataFrameToS3BucketWriter + .write( dataFrame = df, fileFormat = xmlFileFormat, bucketName = bucketName, diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/DataFrameToBigQueryWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/DataFrameToBigQueryWriter.scala index 33d22ba..d4f5612 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/DataFrameToBigQueryWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/DataFrameToBigQueryWriter.scala @@ -1,12 +1,11 @@ package com.clairvoyant.data.scalaxy.writer.gcp.bigquery -import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToBQWriter import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.BigQueryWriterType import org.apache.spark.sql.{DataFrame, SaveMode} -object DataFrameToBigQueryWriter { +trait DataFrameToBigQueryWriter[T]: - def write[T <: BigQueryWriterType]( + def write( dataFrame: DataFrame, table: String, dataset: Option[String] = None, @@ -14,15 +13,10 @@ object DataFrameToBigQueryWriter { parentProject: Option[String] = None, saveMode: SaveMode = SaveMode.Overwrite, writerType: T - )(using dataFrameToBQWriter: DataFrameToBQWriter[T]): Unit = - dataFrameToBQWriter.write( - dataFrame, - table, - dataset, - project, - parentProject, - saveMode, - writerType - ) + ): Unit -} +object DataFrameToBigQueryWriter: + + def apply[T <: 
BigQueryWriterType]( + using dataFrameToBigQueryWriter: DataFrameToBigQueryWriter[T] + ): DataFrameToBigQueryWriter[T] = dataFrameToBigQueryWriter diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToBQWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToBQWriter.scala deleted file mode 100644 index f9cb3a2..0000000 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToBQWriter.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances - -import org.apache.spark.sql.{DataFrame, SaveMode} - -trait DataFrameToBQWriter[T]: - - def write( - dataFrame: DataFrame, - table: String, - dataset: Option[String], - project: Option[String], - parentProject: Option[String], - saveMode: SaveMode = SaveMode.Overwrite, - writerType: T - ): Unit diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToDirectBQWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToDirectBQWriter.scala index 49390d9..fe346b3 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToDirectBQWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToDirectBQWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.DirectBigQueryWriterType import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToDirectBQWriter extends DataFrameToBQWriter[DirectBigQueryWriterType] { +implicit object DataFrameToDirectBQWriter extends DataFrameToBigQueryWriter[DirectBigQueryWriterType] { override def write( dataFrame: DataFrame, diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToIndirectBQWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToIndirectBQWriter.scala index bc13272..cf089be 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToIndirectBQWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/bigquery/instances/DataFrameToIndirectBQWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.IndirectBigQueryWriterType import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToIndirectBQWriter extends DataFrameToBQWriter[IndirectBigQueryWriterType] { +implicit object DataFrameToIndirectBQWriter extends DataFrameToBigQueryWriter[IndirectBigQueryWriterType] { override def write( dataFrame: DataFrame, diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/DataFrameToGCSBucketWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/DataFrameToGCSBucketWriter.scala index ce94cee..31d08fe 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/DataFrameToGCSBucketWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/DataFrameToGCSBucketWriter.scala @@ -1,18 +1,20 @@ package com.clairvoyant.data.scalaxy.writer.gcp.gcs import 
com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.FileFormat -import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToGCSFileWriter import org.apache.spark.sql.{DataFrame, SaveMode} -object DataFrameToGCSBucketWriter { +trait DataFrameToGCSBucketWriter[T]: - def write[T <: FileFormat]( + def write( dataFrame: DataFrame, fileFormat: T, bucketName: String, path: String, saveMode: SaveMode = SaveMode.Overwrite - )(using dataFrameToGCSFileWriter: DataFrameToGCSFileWriter[T]): Unit = - dataFrameToGCSFileWriter.write(dataFrame, fileFormat, bucketName, path, saveMode) + ): Unit -} +object DataFrameToGCSBucketWriter: + + def apply[T <: FileFormat]( + using dataFrameToGCSBucketWriter: DataFrameToGCSBucketWriter[T] + ): DataFrameToGCSBucketWriter[T] = dataFrameToGCSBucketWriter diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToCSVFileWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToCSVFileWriter.scala index b1029c5..b7ec4aa 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToCSVFileWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToCSVFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.CSVFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToCSVFileWriter extends DataFrameToGCSFileWriter[CSVFileFormat] { +implicit object DataFrameToCSVFileWriter extends DataFrameToGCSBucketWriter[CSVFileFormat] { import org.apache.spark.sql.catalyst.csv.CSVOptions.* diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToGCSFileWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToGCSFileWriter.scala deleted file mode 100644 index e270718..0000000 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToGCSFileWriter.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances - -import org.apache.spark.sql.{DataFrame, SaveMode} - -trait DataFrameToGCSFileWriter[T]: - - def write( - dataFrame: DataFrame, - fileFormat: T, - bucketName: String, - path: String, - saveMode: SaveMode - ): Unit diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToJSONFileWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToJSONFileWriter.scala index d44b747..79254fd 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToJSONFileWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToJSONFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.JSONFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToJSONFileWriter extends DataFrameToGCSFileWriter[JSONFileFormat] { +implicit object DataFrameToJSONFileWriter extends DataFrameToGCSBucketWriter[JSONFileFormat] { import org.apache.spark.sql.catalyst.json.JSONOptions.* diff --git 
a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToParquetFileWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToParquetFileWriter.scala index a527f47..dc277b4 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToParquetFileWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToParquetFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.ParquetFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToParquetFileWriter extends DataFrameToGCSFileWriter[ParquetFileFormat] { +implicit object DataFrameToParquetFileWriter extends DataFrameToGCSBucketWriter[ParquetFileFormat] { override def write( dataFrame: DataFrame, diff --git a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToXMLFileWriter.scala b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToXMLFileWriter.scala index d1ac893..a7a0ad4 100644 --- a/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToXMLFileWriter.scala +++ b/gcp/src/main/scala/com/clairvoyant/data/scalaxy/writer/gcp/gcs/instances/DataFrameToXMLFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.XMLFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToXMLFileWriter extends DataFrameToGCSFileWriter[XMLFileFormat] { +implicit object DataFrameToXMLFileWriter extends DataFrameToGCSBucketWriter[XMLFileFormat] { import com.databricks.spark.xml.* diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToLocalFileSystemWriter.scala b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToLocalFileSystemWriter.scala index 7eab569..14d132a 100644 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToLocalFileSystemWriter.scala +++ b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToLocalFileSystemWriter.scala @@ -1,17 +1,19 @@ package com.clairvoyant.data.scalaxy.writer.local.file import com.clairvoyant.data.scalaxy.writer.local.file.formats.FileFormat -import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToFileWriter import org.apache.spark.sql.{DataFrame, SaveMode} -object DataFrameToLocalFileSystemWriter { +trait DataFrameToLocalFileSystemWriter[T]: - def write[T <: FileFormat]( + def write( dataFrame: DataFrame, fileFormat: T, path: String, saveMode: SaveMode = SaveMode.Overwrite - )(using dataFrameToFileWriter: DataFrameToFileWriter[T]): Unit = - dataFrameToFileWriter.write(dataFrame, fileFormat, path, saveMode) + ): Unit -} +object DataFrameToLocalFileSystemWriter: + + def apply[T <: FileFormat]( + using dataFrameToLocalFileSystemWriter: DataFrameToLocalFileSystemWriter[T] + ): DataFrameToLocalFileSystemWriter[T] = dataFrameToLocalFileSystemWriter diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToCSVFileWriter.scala 
b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToCSVFileWriter.scala index 11a11bb..061f302 100644 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToCSVFileWriter.scala +++ b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToCSVFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.local.file.instances +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter import com.clairvoyant.data.scalaxy.writer.local.file.formats.CSVFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToCSVFileWriter extends DataFrameToFileWriter[CSVFileFormat] { +implicit object DataFrameToCSVFileWriter extends DataFrameToLocalFileSystemWriter[CSVFileFormat] { import org.apache.spark.sql.catalyst.csv.CSVOptions.* diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToFileWriter.scala b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToFileWriter.scala deleted file mode 100644 index 3dc6231..0000000 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToFileWriter.scala +++ /dev/null @@ -1,7 +0,0 @@ -package com.clairvoyant.data.scalaxy.writer.local.file.instances - -import org.apache.spark.sql.{DataFrame, SaveMode} - -trait DataFrameToFileWriter[T]: - - def write(dataFrame: DataFrame, fileFormat: T, path: String, saveMode: SaveMode): Unit diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToJSONFileWriter.scala b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToJSONFileWriter.scala index be9d7ee..3e553a8 100644 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToJSONFileWriter.scala +++ b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToJSONFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.local.file.instances +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter import com.clairvoyant.data.scalaxy.writer.local.file.formats.JSONFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToJSONFileWriter extends DataFrameToFileWriter[JSONFileFormat] { +implicit object DataFrameToJSONFileWriter extends DataFrameToLocalFileSystemWriter[JSONFileFormat] { import org.apache.spark.sql.catalyst.json.JSONOptions.* diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToParquetFileWriter.scala b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToParquetFileWriter.scala index 0b6016b..cdd9fde 100644 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToParquetFileWriter.scala +++ b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToParquetFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.local.file.instances +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter import com.clairvoyant.data.scalaxy.writer.local.file.formats.ParquetFileFormat import 
org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToParquetFileWriter extends DataFrameToFileWriter[ParquetFileFormat] { +implicit object DataFrameToParquetFileWriter extends DataFrameToLocalFileSystemWriter[ParquetFileFormat] { override def write(dataFrame: DataFrame, fileFormat: ParquetFileFormat, path: String, saveMode: SaveMode): Unit = dataFrame.write diff --git a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToXMLFileWriter.scala b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToXMLFileWriter.scala index 2b92d27..deb7628 100644 --- a/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToXMLFileWriter.scala +++ b/local-file-system/src/main/scala/com/clairvoyant/data/scalaxy/writer/local/file/instances/DataFrameToXMLFileWriter.scala @@ -1,9 +1,10 @@ package com.clairvoyant.data.scalaxy.writer.local.file.instances +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter import com.clairvoyant.data.scalaxy.writer.local.file.formats.XMLFileFormat import org.apache.spark.sql.{DataFrame, SaveMode} -implicit object DataFrameToXMLFileWriter extends DataFrameToFileWriter[XMLFileFormat] { +implicit object DataFrameToXMLFileWriter extends DataFrameToLocalFileSystemWriter[XMLFileFormat] { import com.databricks.spark.xml.* diff --git a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToCSVLocalFileSystemWriterSpec.scala b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToCSVLocalFileSystemWriterSpec.scala index f3e918c..fd74587 100644 --- a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToCSVLocalFileSystemWriterSpec.scala +++ b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToCSVLocalFileSystemWriterSpec.scala @@ -13,6 +13,8 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToLocalFileSystemWriter = DataFrameToLocalFileSystemWriter[CSVFileFormat] + "write()" should "write a dataframe to the provided path" in { val df = readJSONFromText( """|{ @@ -24,7 +26,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF val csvFileFormat = CSVFileFormat() - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -54,7 +56,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF emptyValue = "NA" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -84,7 +86,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF header = false ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -117,7 +119,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF ignoreLeadingWhiteSpace = false ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -147,7 +149,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF ignoreLeadingWhiteSpace = true ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( 
dataFrame = df, fileFormat = csvFileFormat, @@ -177,7 +179,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF ignoreTrailingWhiteSpace = false ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -207,7 +209,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF ignoreTrailingWhiteSpace = true ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -237,7 +239,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF lineSep = "#" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -270,7 +272,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF nullValue = "Invalid" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, @@ -300,7 +302,7 @@ class DataFrameToCSVLocalFileSystemWriterSpec extends DataFrameReader with DataF sep = ";" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = csvFileFormat, diff --git a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToJSONLocalFileSystemWriterSpec.scala b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToJSONLocalFileSystemWriterSpec.scala index a612c29..0250907 100644 --- a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToJSONLocalFileSystemWriterSpec.scala +++ b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToJSONLocalFileSystemWriterSpec.scala @@ -13,6 +13,8 @@ class DataFrameToJSONLocalFileSystemWriterSpec extends DataFrameReader with Data val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToLocalFileSystemWriter = DataFrameToLocalFileSystemWriter[JSONFileFormat] + "write()" should "write a dataframe to the provided path" in { val df = readJSONFromText( """|{ @@ -24,7 +26,7 @@ class DataFrameToJSONLocalFileSystemWriterSpec extends DataFrameReader with Data val jsonFileFormat = JSONFileFormat() - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = jsonFileFormat, @@ -50,7 +52,7 @@ class DataFrameToJSONLocalFileSystemWriterSpec extends DataFrameReader with Data ignoreNullFields = true ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = jsonFileFormat, diff --git a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToParquetLocalFileSystemWriterSpec.scala b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToParquetLocalFileSystemWriterSpec.scala index 0e14d5d..163aba7 100644 --- a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToParquetLocalFileSystemWriterSpec.scala +++ b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToParquetLocalFileSystemWriterSpec.scala @@ -13,6 +13,8 @@ class DataFrameToParquetLocalFileSystemWriterSpec val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToLocalFileSystemWriter = DataFrameToLocalFileSystemWriter[ParquetFileFormat] + "write()" should "write a dataframe to the 
provided path" in { val df = readJSONFromText( """|{ @@ -24,7 +26,7 @@ class DataFrameToParquetLocalFileSystemWriterSpec val parquetFileFormat = ParquetFileFormat() - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = parquetFileFormat, diff --git a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToXMLLocalFileSystemWriterSpec.scala b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToXMLLocalFileSystemWriterSpec.scala index ec21b2a..04e0b85 100644 --- a/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToXMLLocalFileSystemWriterSpec.scala +++ b/local-file-system/src/test/scala/com/clairvoyant/data/scalaxy/writer/local/file/DataFrameToXMLLocalFileSystemWriterSpec.scala @@ -10,6 +10,8 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF val outputDirPath = s"/tmp/out_${System.currentTimeMillis()}" + val dataFrameToLocalFileSystemWriter = DataFrameToLocalFileSystemWriter[XMLFileFormat] + "write()" should "write a dataframe to the provided path" in { val df = readJSONFromText( """|{ @@ -21,7 +23,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF val xmlFileFormat = XMLFileFormat() - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -58,7 +60,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF attributePrefix = "attr_" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -99,7 +101,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF declaration = "custom_declaration" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -135,7 +137,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF rowTag = "item" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -176,7 +178,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF rootTag = "items" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -223,7 +225,7 @@ class DataFrameToXMLLocalFileSystemWriterSpec extends DataFrameReader with DataF valueTag = "#VALUE" ) - DataFrameToLocalFileSystemWriter + dataFrameToLocalFileSystemWriter .write( dataFrame = df, fileFormat = xmlFileFormat, From 409ebc0c63748fa20cb9a6095094157f592a46ca Mon Sep 17 00:00:00 2001 From: rahulbhatia023 Date: Wed, 20 Dec 2023 16:23:14 -0500 Subject: [PATCH 2/2] Updated documentation --- aws/README.md | 125 +++++++++++++------ gcp/README.md | 241 +++++++++++++++++++++++------------- local-file-system/README.md | 191 ++++++++++++++++++---------- 3 files changed, 370 insertions(+), 187 deletions(-) diff --git a/aws/README.md b/aws/README.md index 001875c..efaf292 100644 --- a/aws/README.md +++ b/aws/README.md @@ -12,7 +12,7 @@ ThisBuild / credentials += Credentials( System.getenv("GITHUB_TOKEN") ) -ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-aws" % "1.0.0" +ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-aws" % "2.0.0" ``` Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment 
variables. @@ -22,6 +22,33 @@ Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variab ## S3 BUCKET User can use this library to write/persist spark dataframe to s3 buckets in various file formats. + +### API + +The library provides below `write` API in type class `DataFrameToS3BucketWriter` in order to write spark +dataframe into S3 bucket. + +```scala +def write( + dataFrame: DataFrame, + fileFormat: T, + bucketName: String, + path: String, + saveMode: SaveMode = SaveMode.Overwrite + ): Unit +``` + +The `write` method takes below arguments: + +| Argument Name | Mandatory | Default Value | Description | +|:--------------|:---------:|:-------------:|:------------------------------------------------| +| dataFrame | Yes | - | Dataframe to write to s3 bucket. | +| fileFormat | Yes | - | `FileFormat` to use while writing to s3 bucket. | +| bucketName | Yes | - | S3 bucket name. | +| path | Yes | - | S3 path to write the dataframe. | +| saveMode | No | overwrite | Save mode to use while writing to s3 bucket. | + + Supported file formats are: * CSV @@ -29,12 +56,25 @@ Supported file formats are: * XML * Parquet + ### CSV Suppose user wants to write the dataframe `df` to s3 bucket `mybucket` under the path `outputPath` in the `csv` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.DataFrameToS3BucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToCSVFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.CSVFileFormat @@ -66,16 +106,11 @@ User can provide below options to the `CSVFileFormat` instance: | timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | -#### 2. Import type class instance -```scala -import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToCSVFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToS3BucketWriter +DataFrameToS3BucketWriter[CSVFileFormat] .write( dataFrame = df, fileFormat = csvFileFormat, @@ -90,7 +125,19 @@ Suppose user wants to write the dataframe `df` to the s3 bucket `myBucket` under format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.DataFrameToS3BucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToJSONFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.JSONFileFormat @@ -113,16 +160,10 @@ User can provide below options to the `JSONFileFormat` instance: | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | | timezone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToJSONFileWriter -`````` - -#### 3. 
Call API - -```scala -DataFrameToS3BucketWriter +DataFrameToS3BucketWriter[JSONFileFormat] .write( dataFrame = df, fileFormat = jsonFileFormat, @@ -136,7 +177,19 @@ DataFrameToS3BucketWriter Suppose user wants to write the dataframe `df` to s3 bucket `myBucket` under the path `outputPath` in the `xml` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.DataFrameToS3BucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToXMLFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.XMLFileFormat @@ -161,16 +214,10 @@ User can provide below options to the `XMLFileFormat` instance: | timestampFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX] | Controls the format used to write TimestampType format columns. | | valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToXMLFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToS3BucketWriter +DataFrameToS3BucketWriter[XMLFileFormat] .write( dataFrame = df, fileFormat = xmlFileFormat, @@ -185,7 +232,19 @@ Suppose user wants to write the dataframe `df` to s3 bucket `myBucket` under the format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.DataFrameToS3BucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToParquetFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.aws.s3.formats.ParquetFileFormat @@ -202,16 +261,10 @@ User can provide below options to the `ParquetFileFormat` instance: | mergeSchema | false | Sets whether we should merge schemas collected from all Parquet part-files. | | compression | snappy | Compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, uncompressed, snappy, gzip, lzo, brotli, lz4, and zstd). | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.aws.s3.instances.DataFrameToParquetFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToS3BucketWriter +DataFrameToS3BucketWriter[ParquetFileFormat] .write( dataFrame = df, fileFormat = parquetFileFormat, diff --git a/gcp/README.md b/gcp/README.md index a63f4cb..a64d603 100644 --- a/gcp/README.md +++ b/gcp/README.md @@ -12,7 +12,7 @@ ThisBuild / credentials += Credentials( System.getenv("GITHUB_TOKEN") ) -ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-gcp" % "1.0.0" +ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-gcp" % "2.0.0" ``` Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables. @@ -22,6 +22,31 @@ Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variab ## GCS Bucket User can use this library to write/persist spark dataframe to gcs buckets in various file formats. 
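As a quick preview of the type-class based API that the sections below describe step by step, a condensed end-to-end sketch for the CSV case could look like the following. This is a sketch, not part of the patch: the DataFrame `df`, the bucket name `my-bucket`, and the output path are placeholder assumptions, while the imports and the `write` signature follow the CSV steps detailed later in this README.

```scala
import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter
import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.CSVFileFormat
import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToCSVFileWriter

// Summon the DataFrameToGCSBucketWriter instance for the CSV file format
// and persist the dataframe. `df` is an existing Spark DataFrame; the
// bucket name and path below are placeholder values.
DataFrameToGCSBucketWriter[CSVFileFormat]
  .write(
    dataFrame = df,
    fileFormat = CSVFileFormat(),
    bucketName = "my-bucket",
    path = "output/csv"
  )
```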
+ +### API + +The library provides below `write` method to write the dataframe to gcs bucket: + +```scala +def write( + dataFrame: DataFrame, + fileFormat: T, + bucketName: String, + path: String, + saveMode: SaveMode = SaveMode.Overwrite + ): Unit +``` + +The `write` method takes below parameters: + +| Parameter Name | Mandatory | Default Value | Description | +|:---------------|:---------:|:-------------:|:--------------------------------------------------------------------------------------------------------------| +| dataFrame | Yes | None | Spark dataframe to be written to gcs bucket. | +| fileFormat | Yes | None | The instance of file format type class. | +| bucketName | Yes | None | The name of gcs bucket where dataframe needs to be persisted. | +| path | Yes | None | The path inside the gcs bucket where dataframe needs to be persisted. | +| saveMode | No | Overwrite | Mode of writing; default is overwrite; can be avoided if writeDisposition/ createDisposition has been defined | + Supported file formats are: * CSV @@ -34,7 +59,19 @@ Supported file formats are: Suppose user wants to write the dataframe `df` to gcs bucket `mybucket` under the path `outputPath` in the `csv` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToCSVFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.CSVFileFormat @@ -66,23 +103,17 @@ User can provide below options to the `CSVFileFormat` instance: | timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToCSVFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToGCSBucketWriter +DataFrameToGCSBucketWriter[CSVFileFormat] .write( dataFrame = df, fileFormat = csvFileFormat, bucketName = mybucket, path = outputPath ) -`````` +``` ### JSON @@ -90,7 +121,19 @@ Suppose user wants to write the dataframe `df` to the gcs bucket `myBucket` unde format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToJSONFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.JSONFileFormat @@ -113,30 +156,36 @@ User can provide below options to the `JSONFileFormat` instance: | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | | timezone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToJSONFileWriter -`````` - -#### 3. Call API +#### 4. 
Call API ```scala -DataFrameToGCSBucketWriter +DataFrameToGCSBucketWriter[JSONFileFormat] .write( dataFrame = df, fileFormat = jsonFileFormat, bucketName = myBucket, path = outputPath ) -`````` +``` ### XML Suppose user wants to write the dataframe `df` to gcs bucket `myBucket` under the path `outputPath` in the `xml` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToXMLFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.XMLFileFormat @@ -161,23 +210,17 @@ User can provide below options to the `XMLFileFormat` instance: | timestampFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX] | Controls the format used to write TimestampType format columns. | | valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToXMLFileWriter -`````` - -#### 3. Call API - -```scala -DataFrameToGCSBucketWriter +DataFrameToGCSBucketWriter[XMLFileFormat] .write( dataFrame = df, fileFormat = xmlFileFormat, bucketName = myBucket, path = outputPath ) -`````` +``` ### PARQUET @@ -185,12 +228,26 @@ Suppose user wants to write the dataframe `df` to gcs bucket `myBucket` under th format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.DataFrameToGCSBucketWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToParquetFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.gcp.gcs.formats.ParquetFileFormat -val parquetFileFormat = ParquetFileFormat() +val parquetFileFormat = ParquetFileFormat( + mergeSchema = true +) ``` User can provide below options to the `ParquetFileFormat` instance: @@ -202,28 +259,50 @@ User can provide below options to the `ParquetFileFormat` instance: | mergeSchema | false | Sets whether we should merge schemas collected from all Parquet part-files. | | compression | snappy | Compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, uncompressed, snappy, gzip, lzo, brotli, lz4, and zstd). | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.gcp.gcs.instances.DataFrameToParquetFileWriter -`````` - -#### 3. Call API - -```scala -DataFrameToGCSBucketWriter +DataFrameToGCSBucketWriter[ParquetFileFormat] .write( dataFrame = df, fileFormat = parquetFileFormat, bucketName = myBucket, path = outputPath ) -`````` +``` ## BigQuery User can use this library to write/persist spark dataframe to google cloud BigQuery table. 
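As with the GCS writer, the BigQuery flow condenses into a few lines once the type class instance is in scope. The sketch below shows the direct write path under stated assumptions: `df`, the table name, and the dataset name are placeholders, and BigQuery credentials are assumed to be configured as described later in this README.

```scala
import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter
import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToDirectBQWriter
import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.DirectBigQueryWriterType

// Summon the writer instance for the direct write path and persist `df`.
// The table and dataset names below are placeholders.
DataFrameToBigQueryWriter[DirectBigQueryWriterType]
  .write(
    dataFrame = df,
    table = "my_bq_table",
    dataset = Some("my_bq_dataset"),
    writerType = DirectBigQueryWriterType(createDisposition = "CREATE_IF_NEEDED")
  )
```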
+### API + +The library provides below `write` method to write the dataframe to bigquery table: + +```scala +def write( + dataFrame: DataFrame, + table: String, + dataset: Option[String] = None, + project: Option[String] = None, + parentProject: Option[String] = None, + saveMode: SaveMode = SaveMode.Overwrite, + writerType: T + ): Unit +``` + +The `write` method takes below parameters: + +| Parameter Name | Mandatory | Default Value | Description | +|:---------------|:---------:|:-------------:|:--------------------------------------------------------------------------------------------------------------------------------------------| +| dataFrame | Yes | None | Spark dataframe to be written to gcs bucket. | +| table | Yes | None | The name of bigquery table where dataframe needs to be persisted. | +| dataset | No | None | The dataset containing the table. If you are providing fully qualified name in `table` parameter, then you can ignore this option. | +| project | No | None | The Google Cloud Project ID of the table.
(Optional. Defaults to the project of the Service Account being used) | +| parentProject | No | None | The Google Cloud Project ID of the table to bill for the export.
(Optional. Defaults to the project of the Service Account being used). | +| saveMode | No | Overwrite | Mode of writing; default is overwrite; can be avoided if writeDisposition/ createDisposition has been defined | +| writerType | Yes | None | The instance of direct or indirect big query writer type. | + There are two ways to write the dataframe to BigQuery table: * Direct Write @@ -238,7 +317,19 @@ Suppose user wants to write the dataframe `df` to the bigQuery table named `myBQ dataset `myBQDataset`. Then user need to perform below steps: -#### 1. Define BigQuery writer type +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToDirectBQWriter +``` + +#### 3. Define BigQuery writer type ```scala import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.DirectBigQueryWriterType @@ -246,7 +337,7 @@ import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.DirectBigQueryWrit val bigQueryWriterType = DirectBigQueryWriterType( createDisposition = "CREATE_IF_NEEDED" ) -`````` +``` Apart from `createDisposition`, user can pass below parameters to the `DirectBigQueryWriterType` instance: @@ -267,33 +358,37 @@ Apart from `createDisposition`, user can pass below parameters to the `DirectBig | queryJobPriority | INTERACTIVE | Priority levels set for the job while reading data from BigQuery query. The permitted values are:
BATCH - Query is queued and started as soon as idle resources are available, usually within a few minutes. If the query hasn't started within 3 hours, its priority is changed to INTERACTIVE.
INTERACTIVE - Query is executed as soon as possible and counts towards the concurrent rate limit and the daily rate limit.
For WRITE, this option takes effect when DIRECT write is used with OVERWRITE mode, where the connector overwrites the destination table using a MERGE statement. | | writeAtLeastOnce | false | Guarantees that data is written to BigQuery at least once. This is a lesser guarantee than exactly once. This is suitable for streaming scenarios in which data is continuously being written in small batches.
Supported only by the `DIRECT` write method and mode is NOT `Overwrite`. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToDirectBQWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter - -DataFrameToBigQueryWriter - .write[DirectBigQueryWriterType]( +DataFrameToBigQueryWriter[DirectBigQueryWriterType] + .write( dataFrame = df, table = myBQTable, dataset = myBQDataset, writerType = bigQueryWriterType ) -`````` +``` -### Direct Write +### Indirect Write Suppose user wants to write the dataframe `df` to the bigQuery table named `myBQTable` present under the dataset `myBQDataset`. Then user need to perform below steps: -#### 1. Define BigQuery writer type +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToIndirectBQWriter +``` + +#### 3. Define BigQuery writer type ```scala import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.IndirectBigQueryWriterType @@ -301,7 +396,7 @@ import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.types.IndirectBigQueryWr val bigQueryWriterType = IndirectBigQueryWriterType( createDisposition = "CREATE_IF_NEEDED" ) -`````` +``` Apart from `createDisposition`, user can pass below parameters to the `DirectBigQueryWriterType` instance: @@ -330,37 +425,17 @@ Apart from `createDisposition`, user can pass below parameters to the `DirectBig | temporaryGcsBucket | None | The GCS bucket that temporarily holds the data before it is loaded to BigQuery. Required unless set in the Spark configuration (spark.conf.set(...)). | | useAvroLogicalTypes | false | When loading from Avro (`.option("intermediateFormat", "avro")`), BigQuery uses the underlying Avro types instead of the logical types [by default]. Supplying this option converts Avro logical types to their corresponding BigQuery data types. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.instances.DataFrameToIndirectBQWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.gcp.bigquery.DataFrameToBigQueryWriter - -DataFrameToBigQueryWriter - .write[IndirectBigQueryWriterType]( +DataFrameToBigQueryWriter[IndirectBigQueryWriterType] + .write( dataFrame = df, table = myBQTable, dataset = myBQDataset, writerType = bigQueryWriterType ) -`````` - -User can provide below parameters to the `write` method: - -| Parameter Name | Mandatory | Default Value | Description | -|:---------------|:---------:|:-------------:|:--------------------------------------------------------------------------------------------------------------------------------------------| -| dataFrame | Yes | None | Spark dataframe to be written to BigQuery table. | -| table | Yes | None | The name of big query table where dataframe needs to be persisted. | -| dataset | No | None | The dataset containing the table. If you are providing fully qualified name in `table` parameter, then you can ignore this option. | -| project | No | None | The Google Cloud Project ID of the table.
(Optional. Defaults to the project of the Service Account being used) | -| parentProject | No | None | The Google Cloud Project ID of the table to bill for the export.
(Optional. Defaults to the project of the Service Account being used). | -| saveMode | No | Overwrite | Mode of writing; default is overwrite; can be avoided if writeDisposition/ createDisposition has been defined | -| writerType | Yes | None | The instance of direct or indirect big query writer type. | +``` Also, note that for writing to the BigQuery it is necessary to have below privileges to the user: diff --git a/local-file-system/README.md b/local-file-system/README.md index 6b474f9..38d04ab 100644 --- a/local-file-system/README.md +++ b/local-file-system/README.md @@ -12,19 +12,60 @@ ThisBuild / credentials += Credentials( System.getenv("GITHUB_TOKEN") ) -ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-local-file-system" % "1.0.0" +ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "writer-local-file-system" % "2.0.0" ``` Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables. `GITHUB_TOKEN` is the Personal Access Token with the permission to read packages. +### API + +The API is very simple. User need to call the `write` method of the `DataFrameToLocalFileSystemWriter` object. + +```scala +def write( + dataFrame: DataFrame, + fileFormat: T, + path: String, + saveMode: SaveMode = SaveMode.Overwrite + ): Unit +``` + +The `write` method takes below parameters: + +| Parameter Name | Mandatory | Default Value | Description | +|:---------------|:---------:|:-------------:|:-------------------------------------------------| +| dataFrame | Yes | | The dataframe to write to the local file system. | +| fileFormat | Yes | | The file format to use to write the dataframe. | +| path | Yes | | The path to write the dataframe. | +| saveMode | No | Overwrite | The save mode to use to write the dataframe. | + +Supported file formats are: + +* CSV +* JSON +* XML +* Parquet + ### CSV Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `csv` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToCSVFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.local.file.formats.CSVFileFormat @@ -37,7 +78,7 @@ val csvFileFormat = CSVFileFormat( User can provide below options to the `CSVFileFormat` instance: | Parameter Name | Default Value | Description | -| :------------------------ | :-------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:--------------------------|:---------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | charToEscapeQuoteEscaping | \ | Sets a single character used for escaping the escape for the quote character. | | compression | none | Compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, bzip2, gzip, lz4, snappy and deflate). | | dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. 
| @@ -56,42 +97,49 @@ User can provide below options to the `CSVFileFormat` instance: | timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToCSVFileWriter -`````` +DataFrameToLocalFileSystemWriter[CSVFileFormat] + .write( + dataFrame = df, + fileFormat = csvFileFormat, + path = outputPath + ) +``` -#### 3. Call API +### JSON + +Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `json` +format. +Then user need to perform below steps: + +#### 1. Import type class ```scala -DataFrameToLocalFileSystemWriter - .write( - dataFrame = df, - fileFormat = csvFileFormat, - path = outputPath - ) -`````` +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter +``` -### JSON +#### 2. Import type class instance -Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `json` format. -Then user need to perform below steps: +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToJSONFileWriter +``` -#### 1. Define file format +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.local.file.formats.JSONFileFormat val jsonFileFormat = JSONFileFormat( - ignoreNullFields = true - ) + ignoreNullFields = true +) ``` User can provide below options to the `JSONFileFormat` instance: | Parameter Name | Default Value | Description | -| :----------------- | :-------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------ | +|:-------------------|:---------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------| | compression | none | Compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, bzip2, gzip, lz4, snappy and deflate). | | dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | | encoding | UTF-8 | Specifies encoding (charset) of saved CSV files. | @@ -101,42 +149,48 @@ User can provide below options to the `JSONFileFormat` instance: | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | | timezone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToJSONFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToLocalFileSystemWriter - .write( - dataFrame = df, - fileFormat = jsonFileFormat, - path = outputPath - ) -`````` +DataFrameToLocalFileSystemWriter[JSONFileFormat] + .write( + dataFrame = df, + fileFormat = jsonFileFormat, + path = outputPath + ) +``` ### XML Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `xml` format. Then user need to perform below steps: -#### 1. Define file format +#### 1. 
Import type class + +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToXMLFileWriter +``` + +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.local.file.formats.XMLFileFormat val xmlFileFormat = XMLFileFormat( - attributePrefix = "attr_" - ) + attributePrefix = "attr_" +) ``` User can provide below options to the `XMLFileFormat` instance: | Parameter Name | Default Value | Description | -| :--------------- | :---------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:-----------------|:-----------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | arrayElementName | item | Name of XML element that encloses each element of an array-valued column when writing. | | attributePrefix | _ | The prefix for attributes so that we can differentiating attributes and elements. This will be the prefix for field names. | | compression | None | Compression codec to use when saving to file.
Should be the fully qualified name of a class implementing org.apache.hadoop.io.compress.CompressionCodec or one of the case-insensitive shortened names (bzip2, gzip, lz4, and snappy).
Defaults to no compression when a codec is not specified. | @@ -148,29 +202,36 @@ User can provide below options to the `XMLFileFormat` instance: | timestampFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX] | Controls the format used to write TimestampType format columns. | | valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToXMLFileWriter -`````` +DataFrameToLocalFileSystemWriter[XMLFileFormat] + .write( + dataFrame = df, + fileFormat = xmlFileFormat, + path = outputPath + ) +``` + +### PARQUET + +Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `parquet` +format. +Then user need to perform below steps: -#### 3. Call API +#### 1. Import type class ```scala -DataFrameToLocalFileSystemWriter - .write( - dataFrame = df, - fileFormat = xmlFileFormat, - path = outputPath - ) -`````` +import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter +``` -### PARQUET +#### 2. Import type class instance -Suppose user wants to write the dataframe `df` to the local file system under the path `outputPath` in the `parquet` format. -Then user need to perform below steps: +```scala +import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToParquetFileWriter +``` -#### 1. Define file format +#### 3. Define file format ```scala import com.clairvoyant.data.scalaxy.writer.local.file.formats.ParquetFileFormat @@ -181,25 +242,19 @@ val parquetFileFormat = ParquetFileFormat() User can provide below options to the `ParquetFileFormat` instance: | Parameter Name | Default Value | Description | -| :----------------- | :-----------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:-------------------|:-------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | datetimeRebaseMode | EXCEPTION | The datetimeRebaseMode option allows to specify the rebasing mode for the values of the DATE, TIMESTAMP_MILLIS, TIMESTAMP_MICROS logical types from the Julian to Proleptic Gregorian calendar.
Currently supported modes are:
EXCEPTION: fails when reading ancient dates/timestamps that are ambiguous between the two calendars.
CORRECTED: loads dates/timestamps without rebasing.
LEGACY: performs rebasing of ancient dates/timestamps from the Julian to Proleptic Gregorian calendar. | | int96RebaseMode | EXCEPTION | The int96RebaseMode option allows you to specify the rebasing mode for INT96 timestamps from the Julian to Proleptic Gregorian calendar. Currently supported modes are:
EXCEPTION: fails when reading ancient INT96 timestamps that are ambiguous between the two calendars.
CORRECTED: loads INT96 timestamps without rebasing.
LEGACY: performs rebasing of ancient timestamps from the Julian to Proleptic Gregorian calendar. | | mergeSchema | false | Sets whether we should merge schemas collected from all Parquet part-files. | | compression | snappy | Compression codec to use when saving to file. This can be one of the known case-insensitive shorten names (none, uncompressed, snappy, gzip, lzo, brotli, lz4, and zstd). | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToParquetFileWriter -`````` - -#### 3. Call API +#### 4. Call API ```scala -DataFrameToLocalFileSystemWriter - .write( - dataFrame = df, - fileFormat = parquetFileFormat, - path = outputPath - ) +DataFrameToLocalFileSystemWriter[ParquetFileFormat] + .write( + dataFrame = df, + fileFormat = parquetFileFormat, + path = outputPath + ) `````` \ No newline at end of file
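
For reference, below is a minimal, self-contained sketch of how the reworked type class API is meant to be called after this change, using the local file system writer with the Parquet format. The `spark` session, the sample dataframe, and the output path `/tmp/scalaxy-writer-demo` are assumptions for illustration only; the imports and the `DataFrameToLocalFileSystemWriter[ParquetFileFormat].write(...)` call mirror the usage documented in the README diff above.

```scala
// Illustrative sketch only (not part of this patch): writing a small dataframe
// to the local file system as Parquet via the new summoner-based type class API.
import com.clairvoyant.data.scalaxy.writer.local.file.DataFrameToLocalFileSystemWriter
import com.clairvoyant.data.scalaxy.writer.local.file.formats.ParquetFileFormat
import com.clairvoyant.data.scalaxy.writer.local.file.instances.DataFrameToParquetFileWriter

import org.apache.spark.sql.{SaveMode, SparkSession}

// Assumed local SparkSession; any existing session works just as well.
val spark = SparkSession.builder().master("local[*]").appName("writer-demo").getOrCreate()

val df = spark.range(5).toDF("id")

// The apply summoner resolves the implicit DataFrameToParquetFileWriter instance
// for ParquetFileFormat and delegates to its write method.
DataFrameToLocalFileSystemWriter[ParquetFileFormat]
  .write(
    dataFrame = df,
    fileFormat = ParquetFileFormat(),
    path = "/tmp/scalaxy-writer-demo", // hypothetical output location
    saveMode = SaveMode.Overwrite
  )
```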