Skip to content

Commit

Permalink
feat(scan): alias and filters (#4)
Browse files Browse the repository at this point in the history
* feat(scan): alias and filters

* test(scan): fix venom tests

* fix: self reference optimization

* feat(scan): rename only to include

* feat(dump): wip! include only fields

* feat(dump): wip! add include flag

* feat(dump): include flag + detect complete entity

* feat(dump): add dump observers

* feat(dump): add watch flag

* feat(dump): rename incomplete to consistent
  • Loading branch information
adrienaury authored Mar 27, 2024
1 parent 72172c8 commit ba2a17c
Show file tree
Hide file tree
Showing 12 changed files with 413 additions and 50 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ Types of changes
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [0.2.0]

- `Added` flag `--include` (short `-i`) to only scan/dump a specific list of fields, this flag is repeatable
- `Added` flag `--alias` (short `-a`) to rename fields on the fly, this flag is repeatable
- `Added` flag `--watch` (short `-w`) to the dump command
- `Fixed` self reference link are no longer counted in the links counter while scanning

## [0.1.0]

- `Added` initial version
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,34 @@ $ silo scan my-silo < input.jsonl

Analysis data is persisted on disk on the `my-silo` path relative to the current directory.

#### passthrough stdin to stdout

Use `--passthrough` (short : `-p`) to pass input to stdout instead of diplaying informations.

```console
$ silo scan my-silo --passthrough < input.jsonl
{"ID_CLIENT":"0001","EMAIL_CLIENT":"[email protected]","ACCOUNT_NUMBER":null}
{"ID_CLIENT":null,"EMAIL_CLIENT":null,"ACCOUNT_NUMBER":"C01"}
```

#### include only specific fields/columns

Use `--include <fieldname>` (short : `-i <fieldname>`, repeatable) to select only given columns to scan.

```console
$ silo scan my-silo --include ID_CLIENT --include EMAIL_CLIENT < input.jsonl
⣾ Scanned 5 rows, found 15 links (4084 row/s) [0s]
```

#### rename fields/columns on the fly

Use `--alias <fieldname>=<alias>` (short : `-a <fieldname>=<alias>`, repeatable) to rename fields before storing links.

```console
$ silo scan my-silo --alias ID_CLIENT=CLIENT --alias EMAIL_CLIENT=EMAIL < input.jsonl
⣾ Scanned 5 rows, found 15 links (4084 row/s) [0s]
```

### silo dump

The silo dump command is used to dump each connected entity into a file. This allows users to create a referential of all entities discovered within the JSONLine data. Here's how to use it:
Expand Down
25 changes: 21 additions & 4 deletions internal/app/cli/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,53 @@ import (
)

func NewDumpCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.File) *cobra.Command {
var (
include []string
watch bool
)

cmd := &cobra.Command{ //nolint:exhaustruct
Use: "dump path",
Short: "Dump silo database stored in given path into stdout",
Example: " " + parent + " dump clients",
Args: cobra.ExactArgs(1),
Run: func(_ *cobra.Command, args []string) {
if err := dump(args[0]); err != nil {
if err := dump(args[0], include, watch); err != nil {
log.Fatal().Err(err).Int("return", 1).Msg("end SILO")
}
},
}

cmd.Flags().StringSliceVarP(&include, "include", "i", []string{}, "include only these columns, exclude all others")
cmd.Flags().BoolVarP(&watch, "watch", "w", false, "watch statistics about dumped entities in stderr")

cmd.Flags().SortFlags = false

cmd.SetOut(stdout)
cmd.SetErr(stderr)
cmd.SetIn(stdin)

return cmd
}

func dump(path string) error {
func dump(path string, include []string, watch bool) error {
backend, err := infra.NewBackend(path)
if err != nil {
return fmt.Errorf("%w", err)
}

defer backend.Close()

driver := silo.NewDriver(backend, infra.NewDumpJSONLine())
driver := silo.NewDriver(backend, infra.NewDumpJSONLine(), silo.WithKeys(include))

if watch {
observer := infra.NewDumpObserver()
defer observer.Close()

if err := driver.Dump(); err != nil {
if err := driver.Dump(observer); err != nil {
return fmt.Errorf("%w", err)
}
} else if err := driver.Dump(); err != nil {
return fmt.Errorf("%w", err)
}

Expand Down
18 changes: 13 additions & 5 deletions internal/app/cli/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,29 @@ import (
)

func NewScanCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.File) *cobra.Command {
var passthrough bool
var (
passthrough bool
include []string
aliases map[string]string
)

cmd := &cobra.Command{ //nolint:exhaustruct
Use: "scan path",
Short: "Ingest data from stdin and update silo database stored in given path",
Example: " lino pull database --table client | " + parent + " scan clients",
Example: " " + parent + " scan clients < clients.jsonl",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
if err := scan(cmd, args[0], passthrough); err != nil {
if err := scan(cmd, args[0], passthrough, include, aliases); err != nil {
log.Fatal().Err(err).Int("return", 1).Msg("end SILO")
}
},
}

cmd.Flags().BoolVarP(&passthrough, "passthrough", "p", false, "pass stdin to stdout")
cmd.Flags().StringSliceVarP(&include, "include", "i", []string{}, "include only these columns, exclude all others")
cmd.Flags().StringToStringVarP(&aliases, "alias", "a", map[string]string{}, "use given aliases for each columns")

cmd.Flags().SortFlags = false

cmd.SetOut(stdout)
cmd.SetErr(stderr)
Expand All @@ -51,15 +59,15 @@ func NewScanCommand(parent string, stderr *os.File, stdout *os.File, stdin *os.F
return cmd
}

func scan(cmd *cobra.Command, path string, passthrough bool) error {
func scan(cmd *cobra.Command, path string, passthrough bool, include []string, aliases map[string]string) error {
backend, err := infra.NewBackend(path)
if err != nil {
return fmt.Errorf("%w", err)
}

defer backend.Close()

driver := silo.NewDriver(backend, nil)
driver := silo.NewDriver(backend, nil, silo.WithKeys(include), silo.WithAliases(aliases))

var reader silo.DataRowReader

Expand Down
87 changes: 87 additions & 0 deletions internal/infra/dump_observer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// Copyright (C) 2024 CGI France
//
// This file is part of SILO.
//
// SILO is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// SILO is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with SILO. If not, see <http://www.gnu.org/licenses/>.

package infra

import (
"fmt"
"os"
"time"

"github.com/cgi-fr/silo/pkg/silo"
"github.com/schollz/progressbar/v3"
)

type DumpObserver struct {
countTotal int
countComplete int
countConsistent int
countInconsistent int
countEmpty int
bar *progressbar.ProgressBar
}

func NewDumpObserver() *DumpObserver {
//nolint:gomnd
pgb := progressbar.NewOptions(-1,
progressbar.OptionSetDescription("Dumping ... "),
progressbar.OptionSetItsString("entity"),
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionShowIts(),
progressbar.OptionSpinnerType(11),
progressbar.OptionThrottle(time.Millisecond*10),
progressbar.OptionOnCompletion(func() { fmt.Fprintln(os.Stderr) }),
// progressbar.OptionShowDescriptionAtLineEnd(),
)

return &DumpObserver{
countTotal: 0,
countComplete: 0,
countConsistent: 0,
countInconsistent: 0,
countEmpty: 0,
bar: pgb,
}
}

func (o *DumpObserver) Entity(status silo.Status, _ map[string]int) {
o.countTotal++

switch status {
case silo.StatusEntityComplete:
o.countComplete++
case silo.StatusEntityConsistent:
o.countConsistent++
case silo.StatusEntityInconsistent:
o.countInconsistent++
case silo.StatusEntityEmpty:
o.countEmpty++
}

_ = o.bar.Add(1)

o.bar.Describe(fmt.Sprintf("Dumped %d entities / complete=%d / consistent=%d / inconsistent=%d / empty=%d",
o.countTotal,
o.countComplete,
o.countConsistent,
o.countInconsistent,
o.countEmpty))
}

func (o *DumpObserver) Close() {
_ = o.bar.Close()
}
52 changes: 52 additions & 0 deletions pkg/silo/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright (C) 2024 CGI France
//
// This file is part of SILO.
//
// SILO is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// SILO is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with SILO. If not, see <http://www.gnu.org/licenses/>.

package silo

import "errors"

type config struct {
include map[string]bool
includeList []string
aliases map[string]string
}

func newConfig() *config {
config := config{
include: map[string]bool{},
includeList: []string{},
aliases: map[string]string{},
}

return &config
}

func (cfg *config) validate() error {
var errs []error

for key := range cfg.aliases {
if _, ok := cfg.include[key]; !ok && len(cfg.include) > 0 {
errs = append(errs, &ConfigScanAliasIsNotIncludedError{alias: key})
}
}

if len(errs) != 0 {
return errors.Join(errs...)
}

return nil
}
4 changes: 4 additions & 0 deletions pkg/silo/driven.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ type ScanObserver interface {
IngestedRow(row DataRow)
IngestedLink(link DataLink)
}

type DumpObserver interface {
Entity(status Status, counts map[string]int)
}
Loading

0 comments on commit ba2a17c

Please sign in to comment.