Skip to content

Commit

Permalink
Implement persistent data storage to reduce cold start time
Browse files Browse the repository at this point in the history
Signed-off-by: Lucas Rodriguez <[email protected]>
  • Loading branch information
lucasrod16 committed Sep 17, 2024
1 parent 59af21b commit 6042e2e
Show file tree
Hide file tree
Showing 11 changed files with 335 additions and 89 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/cronjob.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Workflow that runs the Go program at cmd/cronjob/main.go on a daily schedule
# and on pull requests targeting main.
name: Write GitHub Data To Google Cloud Storage

on:
schedule:
# Five-field cron expression: minute 15, hour 0, every day.
# GitHub Actions evaluates scheduled workflows in UTC.
- cron: "15 0 * * *"

# NOTE(review): the same job also runs for pull requests against main, so the
# "Run CronJob" step executes there as well — confirm that is intended.
pull_request:
branches: main

# presumably id-token: write lets the Google Cloud Auth step request an OIDC
# token for workload identity federation — confirm against the action's docs.
permissions:
contents: read
id-token: write

jobs:
# NOTE(review): the job id says "validate-schema", but the steps below only
# check out the repo, set up Go, authenticate to Google Cloud, and run the
# cron job program — confirm the name is still accurate.
validate-schema:
runs-on: ubuntu-latest
steps:
# Actions are pinned to full commit SHAs; the trailing comment records the
# release tag the SHA is expected to correspond to.
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7

- name: Setup Go
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
with:
# Take the Go version from go.mod instead of hard-coding it here.
go-version-file: go.mod

- name: Google Cloud Auth
uses: google-github-actions/auth@62cf5bd3e4211a0a0b51f2c6d6a37129d828611d # v2.1.5
with:
project_id: "groovy-momentum-434802-g9"
# The provider identifier is read from the WORKLOAD_IDENTITY_PROVIDER secret.
workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}

# Runs the program directly from source with the Go toolchain set up above.
- name: Run CronJob
run: go run cmd/cronjob/main.go
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go.work.sum
.env
data.json
tmp
gha-creds-*.json

# Terraform
# Local .terraform directories
Expand Down
16 changes: 0 additions & 16 deletions Makefile

This file was deleted.

94 changes: 31 additions & 63 deletions pkg/fetch.go → cmd/cronjob/main.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package osscontribute
package main

import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"slices"
"strings"
"time"

"cloud.google.com/go/storage"
"github.com/google/go-github/v64/github"
)

const jsonFile = "data.json"
const (
jsonFile = "data.json"
bucket = "lucasrod16-github-data"
)

type Repo struct {
type repo struct {
Name string `json:"name"`
Description string `json:"description"`
Owner string `json:"owner"`
Expand All @@ -25,42 +28,13 @@ type Repo struct {
Stars int `json:"stars"`
}

type Fetcher struct {
Client *github.Client
Cache *Cache
}

func NewFetcher() *Fetcher {
return &Fetcher{
Client: github.NewClient(nil),
Cache: NewCache(),
}
}
func main() {
ctx := context.Background()
client := github.NewClient(nil)

func (f *Fetcher) RepoData(ctx context.Context) error {
// check if there is existing data on disk to load from on boot.
// only log if there is an error since this could be the first boot.
fi, err := os.Stat(jsonFile)
licenses, err := licenseKeys(ctx, client)
if err != nil {
log.Println(err)
}

// cache JSON data if less than 24 hours old.
// only log the error since this is a best effort attempt.
if err == nil && time.Since(fi.ModTime()) < 24*time.Hour {
data, err := os.ReadFile(jsonFile)
if err != nil {
log.Println(err)
} else {
f.Cache.Set(data)
log.Println("Loaded existing, valid data into the cache. Skipping fetch...")
return nil
}
}

licenses, err := f.licenseKeys(ctx)
if err != nil {
return err
log.Fatal(err)
}

const baseQuery = "is:public archived:false good-first-issues:>=10 help-wanted-issues:>=10 stars:>=500"
Expand All @@ -69,24 +43,23 @@ func (f *Fetcher) RepoData(ctx context.Context) error {

// Map to deduplicate repos
// https://github.com/orgs/community/discussions/24361
repoMap := make(map[string]Repo)
repoMap := make(map[string]repo)

log.Println("fetching GitHub repo data...")
for page := 1; ; page++ {
opts.Page = page
opts.PerPage = 100

result, resp, err := f.Client.Search.Repositories(ctx, query, opts)
result, resp, err := client.Search.Repositories(ctx, query, opts)
if err != nil {
return fmt.Errorf("error searching repos: %w", err)
log.Fatalf("error searching repos: %v", err)
}

if len(result.Repositories) == 0 {
break
log.Fatal("unexpected error: no GitHub repositories found matching the specified search criteria")
}

for _, githubRepo := range result.Repositories {
repo := Repo{
repo := repo{
Name: githubRepo.GetName(),
Description: githubRepo.GetDescription(),
Owner: *githubRepo.Owner.Login,
Expand All @@ -103,45 +76,40 @@ func (f *Fetcher) RepoData(ctx context.Context) error {
}
}

var uniqueRepos []Repo
var uniqueRepos []repo
for _, repo := range repoMap {
uniqueRepos = append(uniqueRepos, repo)
}

// Sort repos by stars in descending order
slices.SortStableFunc(uniqueRepos, func(a, b Repo) int {
slices.SortStableFunc(uniqueRepos, func(a, b repo) int {
return b.Stars - a.Stars
})

data, err := json.MarshalIndent(uniqueRepos, "", " ")
if err != nil {
return err
log.Fatal(err)
}

tmpfile, err := os.CreateTemp(".", "data-*.json")
gcsClient, err := storage.NewClient(ctx)
if err != nil {
return err
log.Fatalf("failed to create GCS client: %v", err)
}
defer os.Remove(tmpfile.Name())
defer gcsClient.Close()

if _, err := tmpfile.Write(data); err != nil {
return err
w := gcsClient.Bucket(bucket).Object(jsonFile).NewWriter(ctx)
if _, err := w.Write(data); err != nil {
log.Fatalf("GCS Write error: %v", err)
}
tmpfile.Close()

if err := os.Rename(tmpfile.Name(), jsonFile); err != nil {
return err
if err := w.Close(); err != nil {
log.Fatalf("error closing GCS writer: %v", err)
}

f.Cache.Set(data)

log.Println("Successfully fetched and cached GitHub repo data")

return nil
log.Printf("Successfully fetched and stored GitHub repo data to GCS bucket: %s", bucket)
}

func (f *Fetcher) licenseKeys(ctx context.Context) (string, error) {
licenses, _, err := f.Client.Licenses.List(ctx)
func licenseKeys(ctx context.Context, client *github.Client) (string, error) {
licenses, _, err := client.Licenses.List(ctx)
if err != nil {
return "", fmt.Errorf("unable to list licenses: %w", err)
}
Expand Down
42 changes: 40 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,44 @@ module github.com/lucasrod16/oss-contribute

go 1.23.1

require github.com/google/go-github/v64 v64.0.0
require (
cloud.google.com/go/storage v1.43.0
github.com/google/go-github/v64 v64.0.0
)

require github.com/google/go-querystring v1.1.0 // indirect
require (
cloud.google.com/go v0.115.0 // indirect
cloud.google.com/go/auth v0.6.1 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
cloud.google.com/go/compute/metadata v0.3.0 // indirect
cloud.google.com/go/iam v1.1.8 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/go-querystring v1.1.0 // indirect
github.com/google/s2a-go v0.1.7 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
go.opentelemetry.io/otel v1.24.0 // indirect
go.opentelemetry.io/otel/metric v1.24.0 // indirect
go.opentelemetry.io/otel/trace v1.24.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/time v0.5.0 // indirect
google.golang.org/api v0.187.0 // indirect
google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect
google.golang.org/grpc v1.64.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
)
Loading

0 comments on commit 6042e2e

Please sign in to comment.