From d0bd081ea9649bb1f5a4692803205f5aa1da9267 Mon Sep 17 00:00:00 2001 From: Nathan <148575555+nathan-artie@users.noreply.github.com> Date: Fri, 26 Jan 2024 12:27:55 -0800 Subject: [PATCH] Support postgres loads (#44) --- lib/postgres/iterator.go | 8 ++++-- lib/postgres/load.go | 59 ++++++++++++++++++++++++++++++++++++++++ main.go | 16 +++++++++-- 3 files changed, 77 insertions(+), 6 deletions(-) create mode 100644 lib/postgres/load.go diff --git a/lib/postgres/iterator.go b/lib/postgres/iterator.go index 9ba8462a..f1290932 100644 --- a/lib/postgres/iterator.go +++ b/lib/postgres/iterator.go @@ -19,7 +19,7 @@ const DefaultErrorRetries = 10 type TableIterator struct { db *sql.DB limit uint - statsD mtr.Client + statsD *mtr.Client maxRowSize uint64 postgresTable *Table firstRow bool @@ -27,7 +27,7 @@ type TableIterator struct { done bool } -func LoadTable(db *sql.DB, table *config.PostgreSQLTable, statsD mtr.Client, maxRowSize uint64) (TableIterator, error) { +func LoadTable(db *sql.DB, table *config.PostgreSQLTable, statsD *mtr.Client, maxRowSize uint64) (TableIterator, error) { slog.Info("Loading configuration for table", slog.String("table", table.Name), slog.Any("limitSize", table.GetLimit())) postgresTable := NewTable(table) @@ -109,7 +109,9 @@ func (i *TableIterator) Next() ([]kafkalib.RawMessage, error) { PartitionKey: partitionKeyMap, Payload: payload, }) - i.statsD.Timing("scanned_and_parsed", time.Since(start), i.statsDTags()) + if i.statsD != nil { + (*i.statsD).Timing("scanned_and_parsed", time.Since(start), i.statsDTags()) + } } // TODO: This should really be re-written and tested thoroughly diff --git a/lib/postgres/load.go b/lib/postgres/load.go new file mode 100644 index 00000000..1548f1e1 --- /dev/null +++ b/lib/postgres/load.go @@ -0,0 +1,59 @@ +package postgres + +import ( + "context" + "database/sql" + "log/slog" + "time" + + "github.com/artie-labs/reader/config" + "github.com/artie-labs/reader/lib/kafkalib" + "github.com/artie-labs/reader/lib/logger" + "github.com/artie-labs/reader/lib/mtr" +) + +func Run(ctx context.Context, cfg config.Settings, statsD *mtr.Client) { + slog.Info("Kafka config", + slog.Bool("aws", cfg.Kafka.AwsEnabled), + slog.String("kafkaBootstrapServer", cfg.Kafka.BootstrapServers), + slog.Any("publishSize", cfg.Kafka.GetPublishSize()), + slog.Uint64("maxRequestSize", cfg.Kafka.MaxRequestSize), + ) + kafkaWriter, err := kafkalib.NewBatchWriter(ctx, *cfg.Kafka) + if err != nil { + logger.Fatal("Failed to create kafka writer", slog.Any("err", err)) + } + + db, err := sql.Open("postgres", NewConnection(cfg.PostgreSQL).String()) + if err != nil { + logger.Fatal("Failed to connect to postgres", slog.Any("err", err)) + } + defer db.Close() + + for _, table := range cfg.PostgreSQL.Tables { + snapshotStartTime := time.Now() + iter, err := LoadTable(db, table, statsD, cfg.Kafka.MaxRequestSize) + if err != nil { + logger.Fatal("Failed to create table iterator", slog.Any("err", err), slog.String("table", table.Name)) + } + + var count int + for iter.HasNext() { + msgs, err := iter.Next() + if err != nil { + logger.Fatal("Failed to iterate over table", slog.Any("err", err), slog.String("table", table.Name)) + } else if len(msgs) > 0 { + if err = kafkaWriter.Write(msgs); err != nil { + logger.Fatal("Failed to write messages to kafka", slog.Any("err", err), slog.String("table", table.Name)) + } + count += len(msgs) + slog.Info("Scanning progress", slog.Duration("timing", time.Since(snapshotStartTime)), slog.Int("count", count)) + } + } + + slog.Info("Finished snapshotting", + slog.Int("scannedTotal", count), + slog.Duration("totalDuration", time.Since(snapshotStartTime)), + ) + } +} diff --git a/main.go b/main.go index 40089336..a5981b84 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "github.com/artie-labs/reader/lib/kafkalib" "github.com/artie-labs/reader/lib/logger" "github.com/artie-labs/reader/lib/mtr" + "github.com/artie-labs/reader/lib/postgres" "github.com/artie-labs/reader/sources/dynamodb" "github.com/getsentry/sentry-go" ) @@ -32,12 +33,21 @@ func main() { } ctx := config.InjectIntoContext(context.Background(), cfg) - ctx = kafkalib.InjectIntoContext(ctx) + var statsD *mtr.Client if cfg.Metrics != nil { slog.Info("Injecting datadog") ctx = mtr.InjectDatadogIntoCtx(ctx, cfg.Metrics.Namespace, cfg.Metrics.Tags, 0.5) + client := mtr.FromContext(ctx) + statsD = &client } - ddb := dynamodb.Load(*cfg) - ddb.Run(ctx) + switch cfg.Source { + case "", config.SourceDynamo: + // TODO: pull kafkalib out of context + ctx = kafkalib.InjectIntoContext(ctx) + ddb := dynamodb.Load(*cfg) + ddb.Run(ctx) + case config.SourcePostgreSQL: + postgres.Run(ctx, *cfg, statsD) + } }