enso-org · mergify · Sep 27, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 26, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -578,6 +578,7 @@
 - [Renamed `Decimal` to `Float`.][7807]
 - [Implemented `Date_Time_Formatter` for more user-friendly date/time format
   parsing.][7826]
+- [Implemented `Table.auto_value_types` for in-memory tables.][7908]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -820,6 +821,7 @@
 [7776]: https://github.com/enso-org/enso/pull/7776
 [7807]: https://github.com/enso-org/enso/pull/7807
 [7826]: https://github.com/enso-org/enso/pull/7826
+[7908]: https://github.com/enso-org/enso/pull/7908
 
 #### Enso Compiler
 

@@ -1,6 +1,4 @@
 import project.Any.Any
-import project.Data.Ordering.Comparable
-import project.Data.Ordering.Ordering
 import project.Nothing.Nothing
 from project.Data.Boolean.Boolean import False, True
 
@@ -98,4 +96,3 @@ type Boolean
              if (27 % 3) == 0 then IO.println "Fizz"
     if_then : Any -> Any | Nothing
     if_then self ~on_true = @Builtin_Method "Boolean.if_then"
-
@@ -1,4 +1,5 @@
 import project.Any.Any
+import project.Data.Ordering.Comparable
 import project.Data.Locale.Locale
 import project.Data.Text.Text
 import project.Error.Error
@@ -1169,3 +1170,39 @@ type Number_Parse_Error
     to_display_text : Text
     to_display_text self =
         "Could not parse " + self.text.to_text + " as a double."
+
+## A wrapper type that ensures that a function may only take positive integers.
+type Positive_Integer
+    ## PRIVATE
+       This constructor should not be used by user code as it can be used to
+       break the invariants. Instead, this type should only be created by `new`
+       or conversions.
+    Value (integer : Integer)
+
+    ## PRIVATE
+       ADVANCED
+       Constructor to create a `Positive_Integer` from an `Integer` - checking
+       if it satisfies the condition. User code should prefer the
+       `Positive_Integer.from` conversion.
+    new (integer : Integer) =
+        if integer > 0 then Positive_Integer.Value integer else
+            Error.throw (Illegal_Argument.Error "Expected a positive integer, but got "+integer.to_display_text)
+
+## Allows to create a `Positive_Integer` from an `Integer`.
+   It will throw `Illegal_Argument` if the provided integer is not positive.
+Positive_Integer.from (that : Integer) = Positive_Integer.new that
+
+## PRIVATE
+Integer.from (that : Positive_Integer) = that.integer
+
+## PRIVATE
+type Positive_Integer_Comparator
+    ## PRIVATE
+    compare x y =
+        Comparable.from x.integer . compare x.integer y.integer
+
+    ## PRIVATE
+    hash x = Comparable.from x.integer . hash x.integer
+
+## PRIVATE
+Comparable.from (_:Positive_Integer) = Positive_Integer_Comparator
@@ -1579,6 +1579,15 @@ type Column
         check_cast_compatibility self.value_type value_type <|
             self.internal_do_cast value_type on_problems
 
+    ## Change the value type of the column to a more specific one, based on its
+       contents.
+
+       This operation is currently not available in the Database backend.
+    auto_value_type : Boolean -> Column
+    auto_value_type self shrink_types=False =
+        _ = shrink_types
+        Error.throw <| Unsupported_Database_Operation.Error "`Column.auto_value_type` is not supported in the Database backends."
+
     ## PRIVATE
        Shares the core CAST logic between `cast` and `parse`.
     internal_do_cast : Value_Type -> Problem_Behavior -> Column

@@ -1979,6 +1979,15 @@ type Table
             new_column = column_to_cast.cast value_type on_problems
             table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update
 
+    ## Change the value type of table columns to a more specific one, based on
+       their contents.
+
+       This operation is currently not available in the Database backend.
+    auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table
+    auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning =
+        _ = [columns, shrink_types, error_on_missing_columns, on_problems]
+        Error.throw (Unsupported_Database_Operation.Error "Table.auto_value_types is not supported in the Database backends.")
+
     ## ALIAS drop_missing_rows, dropna
        GROUP Standard.Base.Selections
        Remove rows which are all blank or containing blank values.

@@ -1762,6 +1762,43 @@ type Column
             on_problems.attach_problems_before problems <|
                 Column.from_storage self.name new_storage
 
+    ## Change the value type of the column to a more specific one, based on its
+       contents.
+
+       Arguments:
+       - shrink_types: If set `True`, smaller types will be chosen if possible,
+         according to the rules below. Defaults to `False`.
+
+       ? Auto Type Selection Rules
+
+         - If a `Mixed` column can be assigned a single type, like `Char` or
+           `Integer`, that will be used.
+         - Text columns are not parsed. To do that, use the `parse` method.
+         - If a `Float` column contains only integers, it will be converted to
+           an Integer column.
+         - If a `Decimal` column contains only integers that could fit in a
+           64-bit integer storage, it will be converted to an Integer column.
+         - If `shrink_types` is `False` (default), no other transformations are
+           applied.
+         - However, if `shrink_types` is set to `True`, then:
+           - Integer columns will be assigned the smallest size that can fit all
+             values (down to 16-bit integers; converting to the `Byte` type has
+             to be done manually through `cast`).
+           - If all elements in a text column have the same length, the type
+             will become fixed length.
+           - Otherwise, if a text column is variable length, but all text
+             elements are no longer than 255 characters, the column will get a
+             max length of 255. Otherwise, the column size limit will stay
+             unchanged.
+    auto_value_type : Boolean -> Column
+    auto_value_type self shrink_types=False =
+        new_value_type = case shrink_types of
+            False -> self.inferred_precise_value_type
+            True ->
+                Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk
+        # We run with Report_Error because we do not expect any problems.
+        self.cast new_value_type on_problems=Problem_Behavior.Report_Error
+
     ## ALIAS transform column
 
        Applies `function` to each item in this column and returns the column

@@ -88,10 +88,20 @@ type Table
                         Column.from_vector (v.at 0) (v.at 1) . java_column
                 Column.Value java_col -> java_col
                 _ -> invalid_input_shape
-        if cols.is_empty then Error.throw (Illegal_Argument.Error "Cannot create a table with no columns.") else
-            if (cols.all c-> c.getSize == cols.first.getSize).not then Error.throw (Illegal_Argument.Error "All columns must have the same row count.") else
-                if cols.distinct .getName . length != cols.length then Error.throw (Illegal_Argument.Error "Column names must be distinct.") else
-                    Table.Value (Java_Table.new cols)
+        Panic.recover Illegal_Argument <|
+            if cols.is_empty then
+                Panic.throw (Illegal_Argument.Error "Cannot create a table with no columns.")
+
+            if cols.distinct .getName . length != cols.length then
+                Panic.throw (Illegal_Argument.Error "Column names must be distinct.")
+
+            mismatched_size_column = cols.find if_missing=Nothing c->
+                c.getSize != cols.first.getSize
+            if mismatched_size_column.is_nothing.not then
+                msg = "All columns must have the same row count, but the column [" + mismatched_size_column.getName + "] has " + mismatched_size_column.getSize.to_text + " rows, while the column [" + cols.first.getName + "] has " + cols.first.getSize.to_text + " rows."
+                Panic.throw (Illegal_Argument.Error msg)
+
+            Table.Value (Java_Table.new cols)
 
     ## GROUP Standard.Base.Constants
        Creates a new table from a vector of column names and a vector of vectors
@@ -946,6 +956,9 @@ type Table
        Arguments:
        - columns: The selection of columns to cast.
        - value_type: The `Value_Type` to cast the column to.
+       - error_on_missing_columns: Specifies if a missing input column should
+         result in an error regardless of the `on_problems` settings. Defaults
+         to `True`.
        - on_problems: Specifies how to handle problems if they occur, reporting
          them as warnings by default.
 
@@ -996,6 +1009,50 @@ type Table
             new_column = column_to_cast.cast value_type on_problems
             table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update
 
+    ## Change the value type of table columns to a more specific one, based on
+       their contents.
+
+       This is most useful for `Mixed` type columns and will allow to narrow
+       down the type if all values in the column fit a more specific type.
+
+       Arguments:
+       - columns: The selection of columns to convert.
+       - shrink_types: If set `True`, smaller types will be chosen if possible,
+         according to the rules below. Defaults to `False`.
+       - error_on_missing_columns: Specifies if a missing input column should
+         result in an error regardless of the `on_problems` settings. Defaults
+         to `True`.
+       - on_problems: Specifies how to handle problems if they occur, reporting
+         them as warnings by default.
+
+       ? Auto Type Selection Rules
+
+         - If a `Mixed` column can be assigned a single type, like `Char` or
+           `Integer`, that will be used.
+         - Text columns are not parsed. To do that, use the `parse` method.
+         - If a `Float` column contains only integers, it will be converted to
+           an Integer column.
+         - If a `Decimal` column contains only integers that could fit in a
+           64-bit integer storage, it will be converted to an Integer column.
+         - If `shrink_types` is `False` (default), no other transformations are
+           applied.
+         - However, if `shrink_types` is set to `True`, then:
+           - Integer columns will be assigned the smallest size that can fit all
+             values (down to 16-bit integers; converting to the `Byte` type has
+             to be done manually through `cast`).
+           - If all elements in a text column have the same length, the type
+             will become fixed length.
+           - Otherwise, if a text column is variable length, but all text
+             elements are no longer than 255 characters, the column will get a
+             max length of 255. Otherwise, the column size limit will stay
+             unchanged.
+    auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table
+    auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning =
+        selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False
+        selected.fold self table-> column_to_cast->
+            new_column = column_to_cast.auto_value_type shrink_types
+            table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update
+
     ## GROUP Standard.Base.Conversions
        Splits a column of text into a set of new columns.
        The original column will be removed from the table.

@@ -18,7 +18,7 @@ polyglot java import org.enso.table.data.column.storage.type.IntegerType
 most_specific_value_type : Any -> Boolean -> Value_Type
 most_specific_value_type value use_smallest=False =
     case value of
-        _ : Float     -> Value_Type.Float Bits.Bits_64
+        _ : Float       -> Value_Type.Float Bits.Bits_64
         _ : Boolean     -> Value_Type.Boolean
         _ : Date        -> Value_Type.Date
         _ : Time_Of_Day -> Value_Type.Time
@@ -33,9 +33,12 @@ most_specific_value_type value use_smallest=False =
                         # We do a small rewrite here - for integers we always return the Integer type, even if the value is small enough to fit in a Byte.
                         if value_type == Value_Type.Byte then Value_Type.Integer Bits.Bits_16 else value_type
                 True -> Value_Type.Decimal precision=Nothing scale=0
-        text : Text     -> case use_smallest of
-            False -> Value_Type.Char size=Nothing variable_length=True
-            True  -> Value_Type.Char size=text.length variable_length=False
+        text : Text     ->
+            length = text.length
+            # Not using Char size=0 for empty strings, because that would be an invalid value.
+            case use_smallest && length > 0 of
+                True  -> Value_Type.Char size=text.length variable_length=False
+                False -> Value_Type.Char size=Nothing variable_length=True
         ## TODO [RW] once we add Enso Native Object Type Value Type, we probably
            want to prefer it over Mixed
         _               -> Value_Type.Mixed
@@ -54,7 +54,7 @@ closest_storage_type value_type = case value_type of
         Error.throw (Illegal_Argument.Error "Value_Type.Char with fixed length must have a non-nothing size")
     Value_Type.Char max_length variable_length ->
         fixed_length = variable_length.not
-        TextType.new max_length fixed_length
+        TextType.new (max_length : Integer) fixed_length
     Value_Type.Date -> DateType.INSTANCE
     # We currently will not support storing dates without timezones in in-memory mode.
     Value_Type.Date_Time _ -> DateTimeType.INSTANCE

@@ -1,4 +1,5 @@
 from Standard.Base import all
+import Standard.Base.Data.Numbers.Positive_Integer
 import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
 
 import project.Data.Type.Value_Type_Helpers
@@ -95,12 +96,22 @@ type Value_Type
 
        ANSI SQL: CHAR, VARCHAR, TEXT, LONGVARCHAR, NCHAR, NVARCHAR, TEXT, CLOB, NCLOB
 
+       ! Counting Characters
+
+         Note that different backends may count the text in different ways.
+         The in-memory backend treats a single grapheme cluster (e.g. 💡) as a
+         single character unit. In most database systems more complex grapheme
+         clusters may be counted as multiple characters. So there isn't a 1-1
+         correspondence between these limits across backends which may cause
+         strings to be truncated if they contain such characters and are close
+         to the limit.
+
        Arguments:
        - size: the maximum number of characters that can be stored in the
-         column. It can be nothing to indicate no limit.
+         column. It can be nothing to indicate no limit. It cannot be 0.
        - variable_length: whether the size is a maximum or a fixed length.
          A fixed length string must have a non-nothing size.
-    Char size:(Integer|Nothing)=Nothing variable_length:Boolean=True
+    Char (size : (Positive_Integer | Nothing) = Nothing) variable_length:Boolean=True
 
     ## Date
 
@@ -383,15 +394,23 @@ type Value_Type
         Value_Type.Integer size -> "Integer (" + size.to_text + ")"
         Value_Type.Float size -> "Float (" + size.to_text + ")"
         Value_Type.Decimal precision scale -> "Decimal (precision=" + precision.to_text + ", scale=" + scale.to_text + ")"
-        Value_Type.Char size variable_length -> case variable_length of
-            True  -> "Char (variable length, max_size=" + size.to_text + ")"
-            False -> "Char (fixed length, size=" + size.to_text + ")"
+        Value_Type.Char size variable_length ->
+            size_text = case size of
+                Nothing -> "unlimited"
+                _ -> size.to Integer . to_text
+            case variable_length of
+                True  -> "Char (variable length, max_size=" + size_text + ")"
+                False -> "Char (fixed length, size=" + size_text + ")"
         Value_Type.Date -> "Date"
         Value_Type.Date_Time with_timezone -> "Date_Time (with_timezone=" + with_timezone.to_text + ")"
         Value_Type.Time -> "Time"
-        Value_Type.Binary size variable_length -> case variable_length of
-            True  -> "Binary (variable length, max_size=" + size.to_text + " bytes)"
-            False -> "Binary (fixed length, size=" + size.to_text + " bytes)"
+        Value_Type.Binary size variable_length ->
+            size_text = case size of
+                Nothing -> "unlimited"
+                _ -> size.to Integer . to_text + " bytes"
+            case variable_length of
+                True  -> "Binary (variable length, max_size=" + size_text + ")"
+                False -> "Binary (fixed length, size=" + size_text + ")"
         Value_Type.Unsupported_Data_Type type_name _ -> case type_name of
             Nothing  -> "Unsupported_Data_Type"
             _ : Text -> "Unsupported_Data_Type (" + type_name + ")"

@@ -10,6 +10,7 @@
 import org.enso.table.data.column.storage.type.FloatType;
 import org.enso.table.data.column.storage.type.StorageType;
 import org.enso.table.error.ValueTypeMismatchException;
+import org.graalvm.polyglot.Context;
 
 // For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning.
 public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
@@ -88,10 +89,12 @@ public void appendRawNoGrow(BigInteger value) {
   }
 
   public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
+    BigIntegerBuilder res = new BigIntegerBuilder(longBuilder.data.length);
     int n = longBuilder.currentSize;
-    BigIntegerBuilder res = new BigIntegerBuilder(n);
+    Context context = Context.getCurrent();
     for (int i = 0; i < n; i++) {
       res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i]));
+      context.safepoint();
     }
     return res;
   }

@@ -26,10 +26,10 @@ public void appendNoGrow(Object o) {
     if (o == null) {
       isMissing.set(currentSize++);
     } else {
-      try {
-        long x = NumericConverter.coerceToLong(o);
+      Long x = NumericConverter.tryConvertingToLong(o);
+      if (x != null) {
         appendLongNoGrow(x);
-      } catch (UnsupportedOperationException e) {
+      } else {
         throw new ValueTypeMismatchException(type, o);
       }
     }

@@ -19,10 +19,10 @@ public void appendNoGrow(Object o) {
     if (o == null) {
       isMissing.set(currentSize++);
     } else {
-      try {
-        long x = NumericConverter.coerceToLong(o);
-        data[currentSize++] = x;
-      } catch (UnsupportedOperationException e) {
+      Long x = NumericConverter.tryConvertingToLong(o);
+      if (x != null) {
+        appendLongNoGrow(x);
+      } else {
         throw new ValueTypeMismatchException(getType(), o);
       }
     }