AbsaOSS · benedeki · Jul 27, 2019 · Jul 9, 2019 · Jul 17, 2019 · Jul 17, 2019
@@ -223,44 +223,52 @@ To enable processing of time entries from other systems **Standardization** offe
 string and even numeric values to timestamp or date types. It's done using Spark's ability to convert strings to 
 timestamp/date with some enhancements. The pattern placeholders and their usage are described in Java's 
 [`SimpleDateFormat` class description](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) with 
-the addition of recognizing two keywords `epoch` and `milliepoch` (case insensitive) to denote the number of 
-seconds/milliseconds since epoch (1970/01/01 00:00:00.000 UTC).
+the addition of recognizing some keywords (like `epoch` and `milliepoch`, both case insensitive) to denote the number of 
+seconds/milliseconds since epoch (1970/01/01 00:00:00.000 UTC), and some additional placeholders.
 It should be noted explicitly that `epoch` and `milliepoch` are considered a pattern including time zone.
 
 Summary:
 
-| placeholder | Description | Example |
-| --- | --- | --- |
-| G | Era designator | AD |
-| y | Year | 1996; 96 |
-| Y | Week year | 2009; 09 |
-| M | Month in year (context sensitive) |  July; Jul; 07 |
-| L | Month in year (standalone form) | July; Jul; 07 |
-| w | Week in year | 27 |
-| W | Week in month | 2 |
-| D | Day in year | 189 |
-| d | Day in month |  10 |
-| F | Day of week in month | 2 |
-| E | Day name in week | Tuesday; Tue |
-| u | Day number of week (1 = Monday, ..., 7 = Sunday) | 1 |
-| a | Am/pm marker | PM |
-| H | Hour in day (0-23) | 0 |
-| k | Hour in day (1-24) | 24 |
-| K | Hour in am/pm (0-11) |  0 |
-| h | Hour in am/pm (1-12) | 12 |
-| m | Minute in hour | 30 |
-| s | Second in minute | 55 |
-| S | Millisecond | 978 |
-| z | General time zone | Pacific Standard Time; PST; GMT-08:00 |
-| Z | RFC 822 time zone | -0800 |
-| X | ISO 8601 time zone | -08; -0800; -08:00 |
-| _epoch_ | Seconds since 1970/01/01 00:00:00 | 1557136493|
-| _milliepoch_ | Milliseconds since 1970/01/01 00:00:00.0000| 15571364938124 |
+| placeholder | Description | Example | Note |
+| --- | --- | --- | --- |
+| G | Era designator | AD | |
+| y | Year | 1996; 96 | |
+| Y | Week year | 2009; 09 | |
+| M | Month in year (context sensitive) |  July; Jul; 07 | |
+| L | Month in year (standalone form) | July; Jul; 07 | |
+| w | Week in year | 27 | |
+| W | Week in month | 2 | |
+| D | Day in year | 189 | |
+| d | Day in month |  10 | |
+| F | Day of week in month | 2 | |
+| E | Day name in week | Tuesday; Tue | |
+| u | Day number of week (1 = Monday, ..., 7 = Sunday) | 1 | |
+| a | Am/pm marker | PM | |
+| H | Hour in day (0-23) | 0 | |
+| k | Hour in day (1-24) | 24 | |
+| K | Hour in am/pm (0-11) |  0 | |
+| h | Hour in am/pm (1-12) | 12 | |
+| m | Minute in hour | 30 | |
+| s | Second in minute | 55 | |
+| S | Millisecond | 978 | |
+| z | General time zone | Pacific Standard Time; PST; GMT-08:00 | |
+| Z | RFC 822 time zone | -0800 | |
+| X | ISO 8601 time zone | -08; -0800; -08:00 | |
+| _epoch_ | Seconds since 1970/01/01 00:00:00 | 1557136493, 1557136493.136| |
+| _epochmilli_ | Milliseconds since 1970/01/01 00:00:00.0000| 1557136493128, 1557136493128.001 | |
+| _epochmicro_ | Microseconds since 1970/01/01 00:00:00.0000| 1557136493128789, 1557136493128789.999 | |
+| _epochnano_ | Nanoseconds since 1970/01/01 00:00:00.0000| 1557136493128789101 | See the remark below regarding the loss of precision in _nanoseconds_ |
+| i | Microsecond | 111, 321001 | |
+| n | Nanosecond | 999, 542113879 | See the remark below regarding the loss of precision in _nanoseconds_ |
+
 
 **NB!** Spark uses US Locale and because on-the-fly conversion would be complicated, at the moment we stick to this 
 hardcoded locale as well. E.g. `am/pm` for `a` placeholder, English names of days and months etc.
 
 **NB!** The keywords are case **insensitive**. Therefore, there is no difference between `epoch` and `EpoCH`.
+
+**NB!** While the _nanoseconds_ designation is supported on input, it's not supported in storage or further usage. So any
+value beyond microsecond precision will be truncated.
 
 ##### Time Zone support
 As it has been mentioned, it's highly recommended to use timestamps with the time zone. But it's not unlikely that the 

@@ -61,6 +61,17 @@ sealed trait TypeParser {
 }
 
 object TypeParser extends StandardizationCommon {
+  import za.co.absa.enceladus.utils.implicits.ColumnImplicits.ColumnEnhancements
+
+  // scalastyle:off magic.number
+  private val decimalType = DecimalType(30,9) // decimal type used for epoch values and second fractions
+  // scalastyle:on magic.number
+
+  private val MillisecondsPerSecond = 1000 // divisor: milliseconds -> fraction of a second
+  private val MicrosecondsPerSecond = 1000000 // divisor: microseconds -> fraction of a second
+  private val NanosecondsPerSecond  = 1000000000 // divisor: nanoseconds -> fraction of a second
+
+
   def standardize(field: StructField, path: String, origSchema: StructType)
                  (implicit udfLib: UDFLibrary): ParseOutput = {
     // udfLib implicit is present for error column UDF implementation
@@ -266,8 +277,6 @@ object TypeParser extends StandardizationCommon {
     override protected def assemblePrimitiveCastErrorLogic(castedCol: Column): Column = {
       //NB! loss of precision is not addressed for any fractional type
 
-      import za.co.absa.enceladus.utils.implicits.ColumnImplicits.ColumnEnhancements
-
       if (allowInfinity) {
         castedCol.isNull or castedCol.isNaN
       } else {
@@ -309,7 +318,6 @@ object TypeParser extends StandardizationCommon {
     * Other         | ->String->to_date               | ->String->to_timestamp->to_utc_timestamp->to_date
     */
   private trait DateTimeParser extends PrimitiveParser {
-    protected val basicCastFunction: (Column, String) => Column  //for epoch casting
     protected val pattern: DateTimePattern = DateTimePattern.fromStructField(field)
 
     override protected def assemblePrimitiveCastLogic: Column = {
@@ -328,11 +336,6 @@ object TypeParser extends StandardizationCommon {
       }
     }
 
-    private def castEpoch(): Column = {
-      val epochPattern: String = Defaults.getGlobalFormat(field.dataType)
-      basicCastFunction(from_unixtime(column.cast(LongType)  / pattern.epochFactor, epochPattern), epochPattern)
-    }
-
     private def castWithPattern(): Column = {
       // sadly with parquet support, incoming might not be all `plain`
       origType match {
@@ -370,6 +373,10 @@ object TypeParser extends StandardizationCommon {
       castStringColumn(nonStringColumn.cast(StringType))
     }
 
+    protected def castEpoch(): Column = { // converts a numeric epoch column to a timestamp
+      (column.cast(decimalType) / pattern.epochFactor).cast(TimestampType) // decimal cast keeps fractional digits
+    }
+
     protected def castStringColumn(stringColumn: Column): Column
 
     protected def castDateColumn(dateColumn: Column): Column
@@ -382,16 +389,31 @@ object TypeParser extends StandardizationCommon {
                                       path: String,
                                       origSchema: StructType,
                                       parent: Option[Parent]) extends DateTimeParser {
-    protected val basicCastFunction: (Column, String) => Column = to_date //for epoch casting
-
-    override protected def castStringColumn(stringColumn: Column): Column = {
-      pattern.defaultTimeZone.map(tz =>
-        to_date(to_utc_timestamp(to_timestamp(stringColumn, pattern), tz))
+    private def applyPatternToStringColumn(column: Column, pattern: String, defaultTimeZone: Option[String]): Column = {
+      defaultTimeZone.map(tz => // default time zone set: to_timestamp -> to_utc_timestamp -> to_date
+        to_date(to_utc_timestamp(to_timestamp(column, pattern), tz))
       ).getOrElse(
-        to_date(stringColumn, pattern)
+        to_date(column, pattern) // no default time zone: parse straight to a date
       )
     }
 
+    override def castEpoch(): Column = {
+      // a number cannot be cast to a date directly, so cast to timestamp first (super) and then truncate to date
+      to_date(super.castEpoch())
+    }
+
+    override protected def castStringColumn(stringColumn: Column): Column = {
+      if (pattern.containsSecondFractions) {
+        // a date ignores second fractions: strip their sections from the value and use the fraction-free pattern
+        applyPatternToStringColumn(
+          stringColumn.removeSections(
+            Seq(pattern.millisecondsPosition, pattern.microsecondsPosition, pattern.nanosecondsPosition).flatten
+          ), pattern.patternWithoutSecondFractions, pattern.defaultTimeZone)
+      } else {
+        applyPatternToStringColumn(stringColumn, pattern, pattern.defaultTimeZone)
+      }
+    }
+
     override protected def castDateColumn(dateColumn: Column): Column = {
       pattern.defaultTimeZone.map(
         tz => to_date(to_utc_timestamp(dateColumn, tz))
@@ -413,11 +435,36 @@ object TypeParser extends StandardizationCommon {
                                            path: String,
                                            origSchema: StructType,
                                            parent: Option[Parent]) extends DateTimeParser {
-    protected val basicCastFunction: (Column, String) => Column = to_timestamp //for epoch casting
+
+    private def applyPatternToStringColumn(column: Column, pattern: String, defaultTimeZone: Option[String]): Column = {
+      val interim: Column = to_timestamp(column, pattern) // parse the string using the given pattern
+      defaultTimeZone.map(to_utc_timestamp(interim, _)).getOrElse(interim) // to_utc_timestamp only if a zone is set
+    }
 
     override protected def castStringColumn(stringColumn: Column): Column = {
-      val interim: Column = to_timestamp(stringColumn, pattern)
-      pattern.defaultTimeZone.map(to_utc_timestamp(interim, _)).getOrElse(interim)
+      if (pattern.containsSecondFractions) {
+        // Trick to get fractions of a second into the timestamp:
+        // - parse the value without its fraction sections (seconds precision) and take its unix_timestamp,
+        // - turn each fraction section present in the pattern into a numeric fraction of a second,
+        // - add both together and cast the sum to timestamp
+        val colSeconds = unix_timestamp(applyPatternToStringColumn(
+          stringColumn.removeSections(
+            Seq(pattern.millisecondsPosition, pattern.microsecondsPosition, pattern.nanosecondsPosition).flatten
+          ), pattern.patternWithoutSecondFractions, pattern.defaultTimeZone))
+
+        val colMilliseconds: Option[Column] =
+          pattern.millisecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / MillisecondsPerSecond)
+        val colMicroseconds: Option[Column] =
+          pattern.microsecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / MicrosecondsPerSecond)
+        val colNanoseconds: Option[Column] =
+          pattern.nanosecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / NanosecondsPerSecond)
+        val colFractions: Column =
+          (colMilliseconds ++ colMicroseconds ++ colNanoseconds).reduceOption(_ + _).getOrElse(lit(0))
+
+        (colSeconds + colFractions).cast(TimestampType)
+      } else {
+        applyPatternToStringColumn(stringColumn, pattern, pattern.defaultTimeZone)
+      }
     }
 
     override protected def castDateColumn(dateColumn: Column): Column = {