diff --git a/docs/src/file-formats.md b/docs/src/file-formats.md index 3af248ce57..7064b9b49f 100644 --- a/docs/src/file-formats.md +++ b/docs/src/file-formats.md @@ -130,6 +130,74 @@ In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done. * CSV-lite allows changing FS and/or RS to any values, perhaps multi-character. +* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following: + * If there are too few keys, but these match the header, empty fields are emitted. + * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted. + * If keys don't match the header, this is an error. + +
+cat data/under-over.json ++
+[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6, "d": 7 }, + { "a": 7, "b": 8 }, + { "a": 9, "b": 10, "c": 11 } +] ++ +
+mlr --ijson --ocsvlite cat data/under-over.json ++
+a,b,c +1,2,3 + +a,b,c,d +4,5,6,7 + +a,b +7,8 + +a,b,c +9,10,11 ++ +
+mlr --ijson --ocsvlite cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 + +a,X,c +7,8,9 ++ +
+mlr --ijson --ocsv cat data/under-over.json ++
+a,b,c +1,2,3 +4,5,6,7 +7,8, +9,10,11 ++ +
+mlr --ijson --ocsv cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 +mlr: CSV schema change: first keys "a,b,c"; current keys "a,X,c" +mlr: exiting due to data error. ++ * In short, use-cases for CSV-lite and TSV-lite are often found when dealing with CSV/TSV files which are formatted in some non-standard way -- you have a little more flexibility available to you. (As an example of this flexibility: ASV and USV are nothing more than CSV-lite with different values for FS and RS.) CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output. diff --git a/docs/src/file-formats.md.in b/docs/src/file-formats.md.in index 601c1bc607..36365a1fb2 100644 --- a/docs/src/file-formats.md.in +++ b/docs/src/file-formats.md.in @@ -42,7 +42,7 @@ In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done. * CSV-lite allows changing FS and/or RS to any values, perhaps multi-character. -* CSV-lite handles schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV, by contrast, does the following: +* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following: * If there are too few keys, but these match the header, empty fields are emitted. * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted. * If keys don't match the header, this is an error. @@ -55,7 +55,7 @@ GENMD-RUN-COMMAND mlr --ijson --ocsvlite cat data/under-over.json GENMD-EOF -GENMD-RUN-COMMAND +GENMD-RUN-COMMAND-TOLERATING-ERROR mlr --ijson --ocsvlite cat data/key-change.json GENMD-EOF @@ -63,7 +63,7 @@ GENMD-RUN-COMMAND mlr --ijson --ocsv cat data/under-over.json GENMD-EOF -GENMD-RUN-COMMAND +GENMD-RUN-COMMAND-TOLERATING-ERROR mlr --ijson --ocsv cat data/key-change.json GENMD-EOF diff --git a/docs/src/questions-about-joins.md b/docs/src/questions-about-joins.md index b8bde2d46d..e3974877ed 100644 --- a/docs/src/questions-about-joins.md +++ b/docs/src/questions-about-joins.md @@ -118,9 +118,7 @@ However, if we ask for left-unpaireds, since there's no `color` column, we get a id,code,color 4,ff0000,red 2,00ff00,green - -id,code -3,0000ff +3,0000ff, To fix this, we can use **unsparsify**: diff --git a/docs/src/record-heterogeneity.md b/docs/src/record-heterogeneity.md index d02a524482..de96ae69cd 100644 --- a/docs/src/record-heterogeneity.md +++ b/docs/src/record-heterogeneity.md @@ -375,13 +375,12 @@ record_count=150,resource=/path/to/second/file CSV and pretty-print formats expect rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema +change -- where by _schema_ we mean simply the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example:
cat data/het.json @@ -446,19 +445,43 @@ record_count resource 150 /path/to/second/file-Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). +
+mlr --ijson --ocsvlite group-like data/het.json ++
+resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false + +record_count,resource +100,/path/to/file +150,/path/to/second/file +
-mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json
-a,b,c -1,2,3 +resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false +mlr: CSV schema change: first keys "resource,loadsec,ok"; current keys "record_count,resource" +mlr: exiting due to data error. +-a,b -4,5 +Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if +there are implicit header changes (no intervening blank line and new header line) as seen above -- +you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). -a,b,c,4 +
+mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv ++
+a,b,c +1,2,3 +4,5, 7,8,9,10diff --git a/docs/src/record-heterogeneity.md.in b/docs/src/record-heterogeneity.md.in index 1aab9dfaae..677098ee87 100644 --- a/docs/src/record-heterogeneity.md.in +++ b/docs/src/record-heterogeneity.md.in @@ -180,13 +180,12 @@ GENMD-EOF CSV and pretty-print formats expect rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema +change -- where by _schema_ we mean simply the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example: GENMD-RUN-COMMAND cat data/het.json @@ -200,10 +199,20 @@ GENMD-RUN-COMMAND mlr --ijson --opprint group-like data/het.json GENMD-EOF -Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). +GENMD-RUN-COMMAND +mlr --ijson --ocsvlite group-like data/het.json +GENMD-EOF GENMD-RUN-COMMAND-TOLERATING-ERROR -mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json +GENMD-EOF + +Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if +there are implicit header changes (no intervening blank line and new header line) as seen above -- +you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). + +GENMD-RUN-COMMAND +mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv GENMD-EOF ## Processing heterogeneous data diff --git a/pkg/output/record_writer_tsv.go b/pkg/output/record_writer_tsv.go index ecb67fd7d4..2a79793b2a 100644 --- a/pkg/output/record_writer_tsv.go +++ b/pkg/output/record_writer_tsv.go @@ -12,11 +12,10 @@ import ( ) type RecordWriterTSV struct { - writerOptions *cli.TWriterOptions - // For reporting schema changes: we print a newline and the new header - lastJoinedHeader *string - // Only write one blank line for schema changes / blank input lines - justWroteEmptyLine bool + writerOptions *cli.TWriterOptions + needToPrintHeader bool + firstRecordKeys []string + firstRecordNF int64 } func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, error) { @@ -27,9 +26,10 @@ func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, er return nil, fmt.Errorf("for CSV, ORS cannot be altered") } return &RecordWriterTSV{ - writerOptions: writerOptions, - lastJoinedHeader: nil, - justWroteEmptyLine: false, + writerOptions: writerOptions, + needToPrintHeader: !writerOptions.HeaderlessOutput, + firstRecordKeys: nil, + firstRecordNF: -1, }, nil } @@ -43,36 +43,22 @@ func (writer *RecordWriterTSV) Write( return nil } - if outrec.IsEmpty() { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - } - joinedHeader := "" - writer.lastJoinedHeader = &joinedHeader - writer.justWroteEmptyLine = true - return nil + if writer.firstRecordKeys == nil { + writer.firstRecordKeys = outrec.GetKeys() + writer.firstRecordNF = int64(len(writer.firstRecordKeys)) } - needToPrintHeader := false - joinedHeader := strings.Join(outrec.GetKeys(), ",") - if writer.lastJoinedHeader == nil || *writer.lastJoinedHeader != joinedHeader { - if writer.lastJoinedHeader != nil { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - } - writer.justWroteEmptyLine = true + if writer.needToPrintHeader { + fields := make([]string, outrec.FieldCount) + i := 0 + for pe := outrec.Head; pe != nil; pe = pe.Next { + fields[i] = pe.Key + i++ } - writer.lastJoinedHeader = &joinedHeader - needToPrintHeader = true - } - - if needToPrintHeader && !writer.writerOptions.HeaderlessOutput { for pe := outrec.Head; pe != nil; pe = pe.Next { bufferedOutputStream.WriteString( colorizer.MaybeColorizeKey( - lib.TSVEncodeField( - pe.Key, - ), + lib.TSVEncodeField(pe.Key), outputIsStdout, ), ) @@ -83,24 +69,44 @@ func (writer *RecordWriterTSV) Write( } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + writer.needToPrintHeader = false + } + + var outputNF int64 = outrec.FieldCount + if outputNF < writer.firstRecordNF { + outputNF = writer.firstRecordNF } + fields := make([]string, outputNF) + var i int64 = 0 for pe := outrec.Head; pe != nil; pe = pe.Next { - bufferedOutputStream.WriteString( - colorizer.MaybeColorizeValue( - lib.TSVEncodeField( - pe.Value.String(), - ), - outputIsStdout, - ), + if i < writer.firstRecordNF && pe.Key != writer.firstRecordKeys[i] { + return fmt.Errorf( + "TSV schema change: first keys \"%s\"; current keys \"%s\"", + strings.Join(writer.firstRecordKeys, writer.writerOptions.OFS), + strings.Join(outrec.GetKeys(), writer.writerOptions.OFS), + ) + } + fields[i] = colorizer.MaybeColorizeValue( + lib.TSVEncodeField(pe.Value.String()), + outputIsStdout, ) - if pe.Next != nil { + i++ + } + + for ; i < outputNF; i++ { + fields[i] = "" + } + + for j, field := range fields { + if j > 0 { bufferedOutputStream.WriteString(writer.writerOptions.OFS) } + bufferedOutputStream.WriteString(field) } - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - writer.justWroteEmptyLine = false + bufferedOutputStream.WriteString(writer.writerOptions.ORS) return nil } diff --git a/test/cases/io-tsv-auto-unsparsify/at/cmd b/test/cases/io-tsv-auto-unsparsify/at/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/at/experr b/test/cases/io-tsv-auto-unsparsify/at/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/at/expout b/test/cases/io-tsv-auto-unsparsify/at/expout new file mode 100644 index 0000000000..c0232182d7 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/at/input.json b/test/cases/io-tsv-auto-unsparsify/at/input.json new file mode 100644 index 0000000000..832be9c9e2 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/cmd b/test/cases/io-tsv-auto-unsparsify/key-change/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/experr b/test/cases/io-tsv-auto-unsparsify/key-change/experr new file mode 100644 index 0000000000..ce615563a8 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/experr @@ -0,0 +1,2 @@ +mlr: TSV schema change: first keys "a b c"; current keys "a X c" +mlr: exiting due to data error. diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/expout b/test/cases/io-tsv-auto-unsparsify/key-change/expout new file mode 100644 index 0000000000..c96a25f193 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/expout @@ -0,0 +1,3 @@ +a b c +1 2 3 +4 5 6 diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/input.json b/test/cases/io-tsv-auto-unsparsify/key-change/input.json new file mode 100644 index 0000000000..841abab575 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "X": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/should-fail b/test/cases/io-tsv-auto-unsparsify/key-change/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/over/cmd b/test/cases/io-tsv-auto-unsparsify/over/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/over/experr b/test/cases/io-tsv-auto-unsparsify/over/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/over/expout b/test/cases/io-tsv-auto-unsparsify/over/expout new file mode 100644 index 0000000000..0a61a24061 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 7 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/over/input.json b/test/cases/io-tsv-auto-unsparsify/over/input.json new file mode 100644 index 0000000000..38b47c2f09 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/input.json @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6, + "d": 7 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/under/cmd b/test/cases/io-tsv-auto-unsparsify/under/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/under/experr b/test/cases/io-tsv-auto-unsparsify/under/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/under/expout b/test/cases/io-tsv-auto-unsparsify/under/expout new file mode 100644 index 0000000000..7b24f5bdbf --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/under/input.json b/test/cases/io-tsv-auto-unsparsify/under/input.json new file mode 100644 index 0000000000..e90f7439ad --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/input.json @@ -0,0 +1,16 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +]