Skip to content

Commit

Permalink
Strip trailing non-alphanumeric characters from importGroup when crea…
Browse files Browse the repository at this point in the history
…ting config.textproto (#276)

* Strip trailing non-alphanumeric characters from importGroup when creating config.textproto

* fix

* remove all non alphanumeric
  • Loading branch information
n-h-diaz authored Aug 30, 2024
1 parent bcf14e8 commit 9908b5a
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 28 deletions.
39 changes: 34 additions & 5 deletions gcf/custom/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,46 @@ package custom

import (
"context"
"fmt"
"path"
"path/filepath"
"sort"
"strings"

pb "github.com/datacommonsorg/tools/gcf/proto"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/proto"
)

func isAlphaNumeric(c byte) bool {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
}

func getImportGroupName(root string) (string, error) {
importGroup := path.Base(root)

// Remove alphanumeric characters since they aren't handled well by GCS.
var builder strings.Builder
for i := 0; i < len(importGroup); i++ {
if isAlphaNumeric(importGroup[i]) {
builder.WriteByte(importGroup[i])
}
}
result := builder.String()
if len(result) == 0 {
return "", fmt.Errorf("importGroup contains no alphanumeric characters: %s", root)
}

// IMPORTANT NOTE:
// importGroup has a character constraint of 20 due to internal limitations.
// https://cloud.google.com/storage-transfer/docs/known-limitations-transfer#tsop-max-path
if len(result) > 20 {
result = result[:20]
}

return result, nil
}

func computeTable(bucket, root, im, tab string, table *Table) *pb.ExternalTable {
bigstoreTmcf := filepath.Join("/bigstore", bucket, root, "data", im, tab, table.tmcf)
bigstoreCsv := []string{}
Expand Down Expand Up @@ -107,11 +138,9 @@ func ComputeManifest(
layout *Layout,
) (*pb.DataCommonsManifest, error) {
root := layout.root
importGroup := path.Base(root)
// IMPORTANT NOTE:
// importGroup has a character constraint of 20 due to internal limitations.
if len(importGroup) > 20 {
importGroup = importGroup[:20]
importGroup, err := getImportGroupName(root)
if err != nil {
return nil, err
}

// Init manifest
Expand Down
6 changes: 3 additions & 3 deletions gcf/custom/manifest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ func (g *TestReader) ListObjects(

func (g *TestReader) ReadObject(
ctx context.Context, bucket, object string) ([]byte, error) {
if bucket == "test-bucket" && object == "parent/demo/data/provenance.json" {
if bucket == "test-bucket" && object == "parent/sustainable_systems_lab/data/provenance.json" {
return []byte(`{"name":"data source", "url": "test.com"}`), nil
}
if bucket == "test-bucket" && object == "parent/demo/data/import1/provenance.json" {
if bucket == "test-bucket" && object == "parent/sustainable_systems_lab/data/import1/provenance.json" {
return []byte(`{"name":"foo_dataset", "url": "test.com/bar"}`), nil
}
return []byte{}, nil
Expand All @@ -55,7 +55,7 @@ func TestComputeManifest(t *testing.T) {
{
"test-bucket",
&Layout{
root: "parent/demo",
root: "parent/sustainable_systems_lab",
prov: "provenance.json",
imports: map[string]*Import{
"import1": {
Expand Down
40 changes: 20 additions & 20 deletions gcf/custom/test_data/golden.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,52 @@ import: {
import_name: "foo_dataset"
provenance_url: "test.com/bar"
category: STATS
import_groups: "demo"
mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import1/ocean/graph.tfrecord@*.gz"
mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/graph.tfrecord@*.gz"
import_groups: "sustainablesystemsla"
mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/graph.tfrecord@*.gz"
mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/graph.tfrecord@*.gz"
table: {
mapping_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/sea.tmcf"
csv_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/river.csv"
csv_path: "/bigstore/test-bucket/parent/demo/data/import1/ocean/lake.csv"
mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/sea.tmcf"
csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/river.csv"
csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/ocean/lake.csv"
}
table: {
mapping_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/data.tmcf"
csv_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/1.csv"
csv_path: "/bigstore/test-bucket/parent/demo/data/import1/smokepm/2.csv"
mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/data.tmcf"
csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/1.csv"
csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/smokepm/2.csv"
}
resolution_info: {
uses_id_resolver: true
}
automated_mcf_generation_by: "demo"
automated_mcf_generation_by: "sustainablesystemsla"
dataset_name: "foo_dataset"
}
import: {
import_name: "import2"
provenance_url: "test.com"
category: STATS
import_groups: "demo"
mcf_proto_url: "/bigstore/test-bucket/parent/demo/data/import2/solar/graph.tfrecord@*.gz"
import_groups: "sustainablesystemsla"
mcf_proto_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/graph.tfrecord@*.gz"
table: {
mapping_path: "/bigstore/test-bucket/parent/demo/data/import2/solar/data.tmcf"
csv_path: "/bigstore/test-bucket/parent/demo/data/import2/solar/output.csv"
mapping_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/data.tmcf"
csv_path: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/solar/output.csv"
}
resolution_info: {
uses_id_resolver: true
}
automated_mcf_generation_by: "demo"
automated_mcf_generation_by: "sustainablesystemsla"
dataset_name: "import2"
}
import: {
import_name: "schema"
provenance_url: "test.com"
category: SCHEMA
mcf_url: "/bigstore/test-bucket/parent/demo/data/import1/*.mcf*"
mcf_url: "/bigstore/test-bucket/parent/demo/data/import2/*.mcf*"
import_groups: "demo"
dataset_name: "demo"
mcf_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import1/*.mcf*"
mcf_url: "/bigstore/test-bucket/parent/sustainable_systems_lab/data/import2/*.mcf*"
import_groups: "sustainablesystemsla"
dataset_name: "sustainablesystemsla"
}
import_groups: {
name: "demo"
name: "sustainablesystemsla"
description: "Custom DC import group"
is_custom_dc: true
}
Expand Down

0 comments on commit 9908b5a

Please sign in to comment.