diff --git a/gcf/custom/manifest.go b/gcf/custom/manifest.go index cbfd58b..250500e 100644 --- a/gcf/custom/manifest.go +++ b/gcf/custom/manifest.go @@ -16,15 +16,46 @@ package custom import ( "context" + "fmt" "path" "path/filepath" "sort" + "strings" pb "github.com/datacommonsorg/tools/gcf/proto" "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/proto" ) +func isAlphaNumeric(c byte) bool { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') +} + +func getImportGroupName(root string) (string, error) { + importGroup := path.Base(root) + + // Remove alphanumeric characters since they aren't handled well by GCS. + var builder strings.Builder + for i := 0; i < len(importGroup); i++ { + if isAlphaNumeric(importGroup[i]) { + builder.WriteByte(importGroup[i]) + } + } + result := builder.String() + if len(result) == 0 { + return "", fmt.Errorf("importGroup contains no alphanumeric characters: %s", root) + } + + // IMPORTANT NOTE: + // importGroup has a character constraint of 20 due to internal limitations. + // https://cloud.google.com/storage-transfer/docs/known-limitations-transfer#tsop-max-path + if len(result) > 20 { + result = result[:20] + } + + return result, nil +} + func computeTable(bucket, root, im, tab string, table *Table) *pb.ExternalTable { bigstoreTmcf := filepath.Join("/bigstore", bucket, root, "data", im, tab, table.tmcf) bigstoreCsv := []string{} @@ -107,11 +138,9 @@ func ComputeManifest( layout *Layout, ) (*pb.DataCommonsManifest, error) { root := layout.root - importGroup := path.Base(root) - // IMPORTANT NOTE: - // importGroup has a character constraint of 20 due to internal limitations. - if len(importGroup) > 20 { - importGroup = importGroup[:20] + importGroup, err := getImportGroupName(root) + if err != nil { + return nil, err } // Init manifest diff --git a/gcf/custom/manifest_test.go b/gcf/custom/manifest_test.go index 6da50dc..d7887e3 100644 --- a/gcf/custom/manifest_test.go +++ b/gcf/custom/manifest_test.go @@ -36,10 +36,10 @@ func (g *TestReader) ListObjects( func (g *TestReader) ReadObject( ctx context.Context, bucket, object string) ([]byte, error) { - if bucket == "test-bucket" && object == "parent/demo/data/provenance.json" { + if bucket == "test-bucket" && object == "parent/sustainable_systems_lab/data/provenance.json" { return []byte(`{"name":"data source", "url": "test.com"}`), nil } - if bucket == "test-bucket" && object == "parent/demo/data/import1/provenance.json" { + if bucket == "test-bucket" && object == "parent/sustainable_systems_lab/data/import1/provenance.json" { return []byte(`{"name":"foo_dataset", "url": "test.com/bar"}`), nil } return []byte{}, nil @@ -55,7 +55,7 @@ func TestComputeManifest(t *testing.T) { { "test-bucket", &Layout{ - root: "parent/demo", + root: "parent/sustainable_systems_lab", prov: "provenance.json", imports: map[string]*Import{ "import1": { diff --git a/gcf/custom/test_data/golden.textproto b/gcf/custom/test_data/golden.textproto index 3a37ac3..725baab 100644 --- a/gcf/custom/test_data/golden.textproto +++ b/gcf/custom/test_data/golden.textproto @@ -2,52 +2,52 @@ import: { import_name: "foo_dataset" provenance_url: "test.com/bar" category: STATS - import_groups: "demo" - mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import1/ocean/graph.tfrecord@*.gz" - mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/graph.tfrecord@*.gz" + import_groups: "sustainablesystemsla" + mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/graph.tfrecord@*.gz" + mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/graph.tfrecord@*.gz" table: { - mapping_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/sea.tmcf" - csv_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/river.csv" - csv_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/lake.csv" + mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/sea.tmcf" + csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/river.csv" + csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/lake.csv" } table: { - mapping_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/data.tmcf" - csv_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/1.csv" - csv_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/2.csv" + mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/data.tmcf" + csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/1.csv" + csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/2.csv" } resolution_info: { uses_id_resolver: true } - automated_mcf_generation_by: "demo" + automated_mcf_generation_by: "sustainablesystemsla" dataset_name: "foo_dataset" } import: { import_name: "import2" provenance_url: "test.com" category: STATS - import_groups: "demo" - mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import2/solar/graph.tfrecord@*.gz" + import_groups: "sustainablesystemsla" + mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/graph.tfrecord@*.gz" table: { - mapping_path: "/bigstore/test-bucket/parent/demo/data/import2/solar/data.tmcf" - csv_path: "/bigstore/test-bucket/parent/demo/data/import2/solar/output.csv" + mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/data.tmcf" + csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/output.csv" } resolution_info: { uses_id_resolver: true } - automated_mcf_generation_by: "demo" + automated_mcf_generation_by: "sustainablesystemsla" dataset_name: "import2" } import: { import_name: "schema" provenance_url: "test.com" category: SCHEMA - mcf_url: "/bigstore/test-bucket/parent/demo/data/import1/*.mcf*" - mcf_url: "/bigstore/test-bucket/parent/demo/data/import2/*.mcf*" - import_groups: "demo" - dataset_name: "demo" + mcf_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/*.mcf*" + mcf_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/*.mcf*" + import_groups: "sustainablesystemsla" + dataset_name: "sustainablesystemsla" } import_groups: { - name: "demo" + name: "sustainablesystemsla" description: "Custom DC import group" is_custom_dc: true }