diff --git a/CHANGELOG.md b/CHANGELOG.md index ecc58f0..5c0b484 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ -- [csvtk v0.28.1](https://github.com/shenwei356/csvtk/releases/tag/v0.28.1) -[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/csvtk/v0.28.1/total.svg)](https://github.com/shenwei356/csvtk/releases/tag/v0.28.1) +- [csvtk v0.29.0](https://github.com/shenwei356/csvtk/releases/tag/v0.29.0) +[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/csvtk/v0.29.0/total.svg)](https://github.com/shenwei356/csvtk/releases/tag/v0.29.0) + - new commands: + - [`fix-quotes`](https://bioinf.shenwei.me/csvtk/usage/#fix-quotes): fix malformed CSV/TSV caused by double-quotes. [#260](https://github.com/shenwei356/csvtk/issues/260) + - [`del-quotes`](https://bioinf.shenwei.me/csvtk/usage/#del-quotes): remove extra double-quotes added by `fix-quotes`. + - `csvtk del-header`: + - fix deleting headers of 2nd and later files. [#257](https://github.com/shenwei356/csvtk/issues/257) + - `csvtk concat`: + - fix panic when no data found. - `csvtk sort`: - support column name containing colons. [#254](https://github.com/shenwei356/csvtk/issues/254) - `csvtk filter2`: @@ -7,10 +14,8 @@ - fix specifying the position for the new column containing only a constant string. [#252](https://github.com/shenwei356/csvtk/issues/252) - `csvtk plot`: - add a new flag `--tick-label-size`. - - `csvtk del-header`: - - fix deleting headers of 2nd and later files. [#257](https://github.com/shenwei356/csvtk/issues/257) - - `csvtk concat`: - - fix panic when no data found. + - `csvtk pretty`: + - replace tabs with spaces. 
- [csvtk v0.28.0](https://github.com/shenwei356/csvtk/releases/tag/v0.28.0) [![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/csvtk/v0.28.0/total.svg)](https://github.com/shenwei356/csvtk/releases/tag/v0.28.0) - `csvtk`: diff --git a/README.md b/README.md index ac941f7..33ccced 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # csvtk - a cross-platform, efficient and practical CSV/TSV toolkit - **Documents:** [http://bioinf.shenwei.me/csvtk](http://bioinf.shenwei.me/csvtk/) -( [**Usage**](http://bioinf.shenwei.me/csvtk/usage/) and [**Tutorial**](http://bioinf.shenwei.me/csvtk/tutorial/)). [中文介绍](http://bioinf.shenwei.me/csvtk/chinese) +( [**Usage**](http://bioinf.shenwei.me/csvtk/usage/), [**Tutorial**](http://bioinf.shenwei.me/csvtk/tutorial/) and [**FAQs**](http://bioinf.shenwei.me/csvtk/faq/)). +[中文介绍](http://bioinf.shenwei.me/csvtk/chinese) - **Source code:** [https://github.com/shenwei356/csvtk](https://github.com/shenwei356/csvtk) [![GitHub stars](https://img.shields.io/github/stars/shenwei356/csvtk.svg?style=social&label=Star&?maxAge=2592000)](https://github.com/shenwei356/csvtk) [![license](https://img.shields.io/github/license/shenwei356/csvtk.svg?maxAge=2592000)](https://github.com/shenwei356/csvtk/blob/master/LICENSE) - **Latest version:** [![Latest Stable Version](https://img.shields.io/github/release/shenwei356/csvtk.svg?style=flat)](https://github.com/shenwei356/csvtk/releases) @@ -63,7 +64,7 @@ It could save you lots of time in (not) writing Python/R scripts. ## Subcommands -51 subcommands in total. +53 subcommands in total. **Information** @@ -108,6 +109,8 @@ It could save you lots of time in (not) writing Python/R scripts. 
**Edit** - [`fix`](https://bioinf.shenwei.me/csvtk/usage/#fix): fix CSV/TSV with different numbers of columns in rows +- [`fix-quotes`](https://bioinf.shenwei.me/csvtk/usage/#fix-quotes): fix malformed CSV/TSV caused by double-quotes +- [`del-quotes`](https://bioinf.shenwei.me/csvtk/usage/#del-quotes): remove extra double-quotes added by `fix-quotes` - [`add-header`](https://bioinf.shenwei.me/csvtk/usage/#add-header): add column names - [`del-header`](https://bioinf.shenwei.me/csvtk/usage/#del-header): delete column names - [`rename`](https://bioinf.shenwei.me/csvtk/usage/#rename): renames column names with new names diff --git a/csvtk/cmd/del-quotes.go b/csvtk/cmd/del-quotes.go new file mode 100644 index 0000000..d14b5ca --- /dev/null +++ b/csvtk/cmd/del-quotes.go @@ -0,0 +1,125 @@ +// Copyright © 2016-2023 Wei Shen +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+
+package cmd
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/shenwei356/xopen"
+	"github.com/spf13/cobra"
+)
+
+// delQuotesCmd represents the del-quotes command, which writes parsed fields back verbatim to undo the quoting added by fix-quotes.
+var delQuotesCmd = &cobra.Command{
+	Use:   "del-quotes",
+	Short: "remove extra double quotes added by 'fix-quotes'",
+	Long: `remove extra double quotes added by 'fix-quotes'
+
+Limitation:
+ 1. Values containing line breaks are not supported.
+
+`,
+	Run: func(cmd *cobra.Command, args []string) {
+		config := getConfigs(cmd)
+		files := getFileListFromArgsAndFile(cmd, args, true, "infile-list", true)
+		if len(files) > 1 {
+			checkError(fmt.Errorf("no more than one file should be given"))
+		}
+		runtime.GOMAXPROCS(config.NumCPUs)
+
+		outfh, err := xopen.Wopen(config.OutFile)
+		checkError(err)
+		defer outfh.Close()
+
+		if config.Tabs {
+			config.Delimiter = '\t'
+		}
+
+		file := files[0]
+		csvReader, err := newCSVReaderByConfig(config, file)
+		if err != nil {
+			if err == xopen.ErrNoContent {
+				log.Warningf("csvtk del-quotes: skipping empty input file: %s", file)
+				return
+			}
+			checkError(err)
+		}
+		// select all columns ("1-") of every record
+		csvReader.Read(ReadOption{
+			FieldStr:      "1-",
+			ShowRowNumber: config.ShowRowNumber,
+		})
+		// fields are written back verbatim, without CSV escaping; only those containing the delimiter are re-enclosed in quotes
+		d := string(config.Delimiter)
+		var i int
+		var v string
+		for record := range csvReader.Ch {
+			if record.Err != nil {
+				checkError(record.Err)
+			}
+			for i, v = range record.Selected {
+				// if fieldNeedsQuotes(v, config.Delimiter) {
+				if strings.Contains(v, d) {
+					record.Selected[i] = `"` + v + `"`
+				}
+			}
+			outfh.WriteString(strings.Join(record.Selected, d))
+			outfh.WriteByte('\n')
+		}
+
+		readerReport(&config, csvReader, file)
+	},
+}
+
+func init() {
+	RootCmd.AddCommand(delQuotesCmd)
+}
+
+// fieldNeedsQuotes reports whether field has to be quoted when written with the given comma; its only call site above is commented out. copy from https://cs.opensource.google/go/go/+/refs/tags/go1.21.4:src/encoding/csv/writer.go;l=157
+func fieldNeedsQuotes(field string, comma rune) bool {
+	if field == "" {
+		return false
+	}
+
+	if field == `\.` {
+		return true
+	}
+
+	if comma < utf8.RuneSelf {
+		for i := 0; i < len(field); i++ {
+			c := field[i]
+			if c == '\n' || c == '\r' || c == '"' || c == byte(comma) {
+				return true
+			}
+		}
+	} else {
+		if strings.ContainsRune(field, comma) || strings.ContainsAny(field, "\"\r\n") {
+			return true
+		}
+	}
+
+	r1, _ := utf8.DecodeRuneInString(field)
+	return unicode.IsSpace(r1)
+}
diff --git a/csvtk/cmd/fix-quotes.go b/csvtk/cmd/fix-quotes.go
new file mode 100644
index 0000000..2a5c532
--- /dev/null
+++ b/csvtk/cmd/fix-quotes.go
@@ -0,0 +1,229 @@
+// Copyright © 2016-2023 Wei Shen
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+package cmd
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"regexp"
+	"runtime"
+
+	"github.com/shenwei356/xopen"
+	"github.com/spf13/cobra"
+)
+
+// fixquotesCmd represents the fix-quotes command, a line-based fixer that escapes and encloses fields holding stray double-quotes.
+var fixquotesCmd = &cobra.Command{
+	Use:   "fix-quotes",
+	Short: "fix malformed CSV/TSV caused by double-quotes",
+	Long: `fix malformed CSV/TSV caused by double-quotes
+
+This command fixes fields not appropriately enclosed by double-quotes
+to meet the RFC4180 specification (https://rfc-editor.org/rfc/rfc4180.html).
+
+When and how to:
+ 1. Values containing bare double quotes. e.g.,
+ a,abc" xyz,d
+ Error information: bare " in non-quoted-field.
+ Fix: adding the flag -l/--lazy-quotes.
+ Using this command:
+ a,abc" xyz,d -> a,"abc"" xyz",d
+ 2. Values with double quotes in the begining but not in the end. e.g.,
+ a,"abc" xyz,d
+ Error information: extraneous or missing " in quoted-field.
+ Using this command:
+ a,"abc" xyz,d -> a,"""abc"" xyz",d
+
+Next:
+ 1. You can process the data without the flag -l/--lazy-quotes.
+ 2. Use 'csvtk del-quotes' if you want to restore the original format.
+
+Limitation:
+ 1. Values containing line breaks are not supported.
+
+`,
+	Run: func(cmd *cobra.Command, args []string) {
+		config := getConfigs(cmd)
+		files := getFileListFromArgsAndFile(cmd, args, true, "infile-list", true)
+		if len(files) > 1 {
+			checkError(fmt.Errorf("no more than one file should be given"))
+		}
+		runtime.GOMAXPROCS(config.NumCPUs)
+
+		if config.Tabs {
+			config.Delimiter = '\t'
+		}
+
+		outfh, err := xopen.Wopen(config.OutFile)
+		checkError(err)
+		defer outfh.Close()
+
+		fh, err := xopen.Ropen(files[0])
+		checkError(err)
+		defer func() {
+			checkError(fh.Close())
+		}()
+
+		var buf bytes.Buffer
+		// the input is scanned line by line, so a record must not span several lines
+		scanner := bufio.NewScanner(fh)
+		scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), 1<<30) // accept lines longer than the 64 KiB default limit
+		var line, field string
+		var i, s int
+		var r, p rune // p: previous rune of the current field; its opening quote and skipped in-field delimiters are not recorded
+		var firstField, firstChar bool
+		var hasLeftQuotes, hasRightQuotes bool
+		var nInnerQuotes int // number of inner quotes, might including the right quotes
+		d, dLen := config.Delimiter, len(string(config.Delimiter)) // dLen: width of the delimiter in bytes
+		re := regexp.MustCompile(`"`)
+		var n, ncols, ncolsLine int // ncolsLine: the line that set the expected number of fields
+		ncols = -1
+		var iLine int
+		var reQuotedDelimiter = regexp.MustCompile(fmt.Sprintf(`(^|%[1]s)".*%[1]s.*"($|%[1]s)`, regexp.QuoteMeta(string(d))))
+		var hasQuotedDelimiter bool
+		for scanner.Scan() {
+			iLine++
+			line = scanner.Text()
+			hasQuotedDelimiter = reQuotedDelimiter.MatchString(line)
+
+			if len(line) == 0 || line[0] == byte(config.CommentChar) {
+				outfh.WriteString(line)
+				outfh.WriteByte('\n')
+
+				continue
+			}
+			// reset the per-line and per-field state
+			n = 0
+			firstField = true
+
+			firstChar = true
+			p, nInnerQuotes, hasLeftQuotes, hasRightQuotes = 0, 0, false, false
+			buf.Reset()
+
+			s = 0
+
+			for i, r = range line {
+				if r == d {
+					if p == '"' {
+						hasRightQuotes = true
+						nInnerQuotes--
+					}
+
+					// might be a comma within a field
+					if hasLeftQuotes && !hasRightQuotes && hasQuotedDelimiter {
+						continue
+					}
+
+					if firstField {
+						field = line[s:i]
+					} else {
+						field = line[s+dLen : i]
+					}
+					// escape and enclose a field with stray or unbalanced quotes
+					if nInnerQuotes > 0 ||
+						(hasLeftQuotes && !hasRightQuotes) ||
+						(!hasLeftQuotes && hasRightQuotes) {
+						field = re.ReplaceAllString(field, `""`)
+						field = `"` + field + `"`
+					}
+
+					if !firstField {
+						buf.WriteRune(d)
+					}
+					buf.WriteString(field)
+
+					s = i
+
+					firstField = false
+					n++
+					// start the next field with a clean state, so that a quote ending this field is not seen by an empty one
+					firstChar = true
+					p, nInnerQuotes, hasLeftQuotes, hasRightQuotes = 0, 0, false, false
+
+					continue
+				}
+
+				if firstChar {
+					firstChar = false
+					if r == '"' {
+						hasLeftQuotes = true
+						continue // not recorded in p: the opening quote must never be taken for the closing one
+					}
+				} else if r == '"' {
+					nInnerQuotes++
+				}
+				p = r
+			}
+
+			i = len(line)
+			// the last field of the line
+
+			if p == '"' {
+				hasRightQuotes = true
+				nInnerQuotes--
+			}
+
+			if firstField {
+				field = line[s:i]
+			} else {
+				field = line[s+dLen : i]
+			}
+
+			if nInnerQuotes > 0 ||
+				(hasLeftQuotes && !hasRightQuotes) ||
+				(!hasLeftQuotes && hasRightQuotes) {
+				field = re.ReplaceAllString(field, `""`)
+				field = `"` + field + `"`
+			}
+
+			if !firstField {
+				buf.WriteRune(d)
+			}
+			buf.WriteString(field)
+
+			// the last record
+
+			n++
+
+			buf.WriteByte('\n')
+
+			outfh.Write(buf.Bytes())
+
+			// check ncols
+			if ncols < 0 {
+				ncols, ncolsLine = n, iLine
+			} else if n != ncols {
+				checkError(fmt.Errorf("failed to fix (unequal number of fields: %d (line %d) != %d (line %d), does exist quoted delimiter?): %s",
+					n, iLine, ncols, ncolsLine, line))
+
+			}
+
+		}
+		if err := scanner.Err(); err != nil {
+			checkError(err)
+		}
+	},
+}
+
+func init() {
+	RootCmd.AddCommand(fixquotesCmd)
+}
diff --git a/csvtk/cmd/version.go b/csvtk/cmd/version.go
index c0c7185..69b66d2 100644
--- a/csvtk/cmd/version.go
+++ b/csvtk/cmd/version.go
@@ -29,7 +29,7 @@ import (
 )
 
 // VERSION of csvtk
-const VERSION = "0.28.1"
+const VERSION = "0.29.0"
 
 // versionCmd represents the version command
 var versionCmd = &cobra.Command{
diff --git a/doc/docs/faq.md b/doc/docs/faq.md
new file mode 100644
index 0000000..711252c
--- /dev/null
+++ b/doc/docs/faq.md
@@ -0,0 +1,78 @@
+# Frequently Asked Questions
+
+## The specification of CSV format
+
+The CSV parser used by csvtk follows the [RFC4180](https://rfc-editor.org/rfc/rfc4180.html) specification.
+
+## bare " in non-quoted-field
+
+```
+ 5. Each field may or may not be enclosed in double quotes (however
+ some programs, such as Microsoft Excel, do not use double quotes
+ at all).
If fields are not enclosed with double quotes, then + double quotes may not appear inside the fields. For example: + + "aaa","bbb","ccc" CRLF + zzz,yyy,xxx + + 6. Fields containing line breaks (CRLF), double quotes, and commas + should be enclosed in double-quotes. For example: + + "aaa","b CRLF + bb","ccc" CRLF + zzz,yyy,xxx + + 7. If double-quotes are used to enclose fields, then a double-quote + appearing inside a field must be escaped by preceding it with + another double quote. For example: + + "aaa","b""bb","ccc" +``` + +If a single double-quote exists in one non-quoted-field, an error will be reported. e.g, + + $ echo 'a,abc" xyz,d' + a,abc" xyz,d + + $ echo 'a,abc" xyz,d' | csvtk cut -f 1- + [ERRO] parse error on line 1, column 6: bare " in non-quoted-field + +You can add the flag `-l/--lazy-quotes` to fix this. + + $ echo 'a,abc" xyz,d' | csvtk cut -f 1- -l + a,"abc"" xyz",d + +## extraneous or missing " in quoted-field + +But for the situation below, `-l/--lazy-quotes` won't help: + + $ echo 'a,"abc" xyz,d' + a,"abc" xyz,d + + $ echo 'a,"abc" xyz,d' | csvtk cut -f 1- + [ERRO] parse error on line 1, column 7: extraneous or missing " in quoted-field + + $ echo 'a,"abc" xyz,d' | csvtk cut -f 1- -l + a,"abc"" xyz,d + " + + $ echo 'a,"abc" xyz,d' | csvtk cut -f 1- -l | csvtk dim + file num_cols num_rows + - 2 0 + +**You need to use [csvtk fix-quotes](https://bioinf.shenwei.me/csvtk/usage/#fix-quotes) (available in v0.29.0 or later versions)**: + + $ echo 'a,"abc" xyz,d' | csvtk fix-quotes + a,"""abc"" xyz",d + + $ echo 'a,"abc" xyz,d' | csvtk fix-quotes | csvtk cut -f 1- + a,"""abc"" xyz",d + + $ echo 'a,"abc" xyz,d' | csvtk fix-quotes | csvtk cut -f 1- | csvtk dim + file num_cols num_rows + - 3 0 + +Use [del-quotes](https://bioinf.shenwei.me/csvtk/usage/#del-quotes) if you need the original format after some operations. 
+ + $ echo 'a,"abc" xyz,d' | csvtk fix-quotes | csvtk cut -f 1- | csvtk del-quotes + a,"abc" xyz,d diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 1896915..ecf02a4 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -100,7 +100,7 @@ Usage ```text csvtk -- a cross-platform, efficient and practical CSV/TSV toolkit -Version: 0.28.0 +Version: 0.29.0 Author: Wei Shen @@ -145,10 +145,12 @@ Available Commands: csv2xlsx convert CSV/TSV files to XLSX file cut select and arrange fields del-header delete column names + del-quotes remove extra double quotes added by 'fix-quotes' dim dimensions of CSV file filter filter rows by values of selected fields with arithmetic expression filter2 filter rows by awk-like arithmetic/string expressions fix fix CSV/TSV with different numbers of columns in rows + fix-quotes fix malformed CSV/TSV caused by double-quotes fmtdate format date of selected fields fold fold multiple values of a field into cells of groups freq frequencies of selected fields @@ -207,7 +209,7 @@ Flags: -Z, --show-row-number show row number as the first column, with header row skipped -t, --tabs specifies that the input CSV file is delimited with tabs. Overrides "-d" -Use "csvtk [command] --help" for more information about a command +Use "csvtk [command] --help" for more information about a command. ``` ## headers @@ -2703,6 +2705,139 @@ $ cat testdata/unequal_ncols.csv | csvtk fix | csvtk pretty -S grid ``` +## fix-quotes + +Usage + +```text +fix malformed CSV/TSV caused by double-quotes + +This command fixes fields not appropriately enclosed by double-quotes +to meet the RFC4180 standard (https://rfc-editor.org/rfc/rfc4180.html). + +When and how to: + 1. Values containing bare double quotes. e.g., + a,abc" xyz,d + Error information: bare " in non-quoted-field. + Fix: adding the flag -l/--lazy-quotes. + Using this command: + a,abc" xyz,d -> a,"abc"" xyz",d + 2. Values with double quotes in the begining but not in the end. 
e.g., + a,"abc" xyz,d + Error information: extraneous or missing " in quoted-field. + Using this command: + a,"abc" xyz,d -> a,"""abc"" xyz",d + +Next: + 1. You can process the data without the flag -l/--lazy-quotes. + 2. Use 'csvtk del-quotes' if you want to restore the original format. + +Limitation: + 1. Values containing line breaks are not supported. + +Usage: + csvtk fix-quotes [flags] + +Flags: + -h, --help help for fix-quotes +``` + +Examples: + +1. Test data, in which there are five cases with values containing double quotes. + + $ cat testdata/malformed.tsv + 1 Cellvibrio no quotes & not tab + 2 "Cellvibrio gilvus" quotes can be removed + 3 "quotes required" quotes needed (with a tab in the cell) + 4 fake" record bare double-quote in non-quoted-field + 5 "Cellvibrio" Winogradsky only with doub-quote in the beginning + 6 fake record2" "only with doub-quote in the end" + + $ cat testdata/malformed.tsv | csvtk cut -f 1- + [ERRO] parse error on line 2, column 3: bare " in non-quoted-field + + # -l does not work, and it's messed up. + $ cat testdata/malformed.tsv | csvtk cut -f 1- -l + 1 Cellvibrio no quotes & not tab + "2 ""Cellvibrio gilvus"" quotes can be removed" + "3 ""quotes required"" quotes needed (with a tab in the cell)" + "4 fake"" record bare double-quote in non-quoted-field" + "5 ""Cellvibrio"" Winogradsky only with doub-quote in the beginning" + "6 fake record2"" ""only with doub-quote in the end""" + +1. Fix it!!! 
+ + $ cat testdata/malformed.tsv | csvtk fix-quotes -t + 1 Cellvibrio no quotes & not tab + 2 "Cellvibrio gilvus" quotes can be removed + 3 "quotes required" quotes needed (with a tab in the cell) + 4 "fake"" record" bare double-quote in non-quoted-field + 5 """Cellvibrio"" Winogradsky" only with doub-quote in the beginning + 6 "fake record2""" "only with doub-quote in the end" + + # pretty + $ cat testdata/malformed.tsv | csvtk fix-quotes -t | csvtk pretty -Ht -S grid + +---+--------------------------+----------------------------------------+ + | 1 | Cellvibrio | no quotes & not tab | + +---+--------------------------+----------------------------------------+ + | 2 | Cellvibrio gilvus | quotes can be removed | + +---+--------------------------+----------------------------------------+ + | 3 | quotes required | quotes needed (with a tab in the cell) | + +---+--------------------------+----------------------------------------+ + | 4 | fake" record | bare double-quote in non-quoted-field | + +---+--------------------------+----------------------------------------+ + | 5 | "Cellvibrio" Winogradsky | only with doub-quote in the beginning | + +---+--------------------------+----------------------------------------+ + | 6 | fake record2" | only with doub-quote in the end | + +---+--------------------------+----------------------------------------+ + + # do something, like searching rows containing double-quotes. 
+ # since the command-line argument parser csvtk uses parses the value of flag -p + # as CSV data, we have to use -p '""""' to represent one double-quote, + # where the outer two double quotes are used to quote the value, + # and the two inner double-quotes actually mean an escaped double-quote + # + $ cat testdata/malformed.tsv \ + | csvtk fix-quotes -t \ + | csvtk grep -Ht -f 2 -r -p '""""' + 4 "fake"" record" bare double-quote in non-quoted-field + 5 """Cellvibrio"" Winogradsky" only with doub-quote in the beginning + 6 "fake record2""" only with doub-quote in the end + +1. Note that fixed rows are different from the original ones, you can use `csvtk del-quotes` to reset them. + + $ cat testdata/malformed.tsv \ + | csvtk fix-quotes -t \ + | csvtk filter2 -t -f '$1 > 0' \ + | csvtk del-quotes -t + 1 Cellvibrio no quotes & not tab + 2 Cellvibrio gilvus quotes can be removed + 3 "quotes required" quotes needed (with a tab in the cell) + 4 fake" record bare double-quote in non-quoted-field + 5 "Cellvibrio" Winogradsky only with doub-quote in the beginning + 6 fake record2" only with doub-quote in the end + + +## del-quotes + +Usage + +```text +remove extra double quotes added by 'fix-quotes' + +Limitation: + 1. Values containing line breaks are not supported. 
+ +Usage: + csvtk del-quotes [flags] + +Flags: + -h, --help help for del-quotes +``` + +Examples: see examples of [fix-quotes](#fix-quotes) + ## add-header Usage diff --git a/doc/mkdocs.yml b/doc/mkdocs.yml index 8b3c856..e2561d1 100644 --- a/doc/mkdocs.yml +++ b/doc/mkdocs.yml @@ -3,6 +3,7 @@ nav: - Home: index.md - Download: download.md - Usage: usage.md +- FAQs: faq.md - Tutorial: tutorial.md - 中文介绍: chinese.md - More tools: https://github.com/shenwei356 diff --git a/go.mod b/go.mod index e9a849f..26c9c6a 100644 --- a/go.mod +++ b/go.mod @@ -16,9 +16,9 @@ require ( github.com/shenwei356/breader v0.3.2 github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba github.com/shenwei356/natsort v0.0.0-20220117010048-580176ad49fb - github.com/shenwei356/stable v0.1.4 + github.com/shenwei356/stable v0.1.5 github.com/shenwei356/util v0.5.2 - github.com/shenwei356/xopen v0.2.2 + github.com/shenwei356/xopen v0.3.1 github.com/spf13/cobra v1.7.0 github.com/tatsushid/go-prettytable v0.0.0-20141013043238-ed2d14c29939 github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553 @@ -31,6 +31,7 @@ require ( git.sr.ht/~sbinet/gg v0.3.1 // indirect github.com/VividCortex/ewma v1.2.0 // indirect github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b // indirect + github.com/dsnet/compress v0.0.1 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/go-fonts/liberation v0.3.0 // indirect github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9 // indirect diff --git a/go.sum b/go.sum index 2657e38..4f08b63 100644 --- a/go.sum +++ b/go.sum @@ -29,6 +29,9 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dsnet/compress v0.0.1 
h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= +github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= +github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= @@ -67,9 +70,11 @@ github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+ github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/jung-kurt/gofpdf v1.16.2/go.mod h1:1hl7y57EsiPAkLbOwzpzqgx1A30nQCk/YmFV8S2vmK0= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/klauspost/compress v1.15.12 h1:YClS/PImqYbn+UILDnqxQCZ3RehC9N318SU3kElDUEM= github.com/klauspost/compress v1.15.12/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= +github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= @@ -132,12 +137,13 @@ github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba/go.mod h1:Li github.com/shenwei356/natsort v0.0.0-20190418160752-600d539c017d/go.mod h1:SiiGiRFyRtV7S9RamOrmQR5gpGIRhWJM1w0EtmuQ1io= github.com/shenwei356/natsort v0.0.0-20220117010048-580176ad49fb h1:pb0RhpaADsFrKNLST9oogHPlZJec7vT4Gvny5FFhaxU= github.com/shenwei356/natsort 
v0.0.0-20220117010048-580176ad49fb/go.mod h1:SiiGiRFyRtV7S9RamOrmQR5gpGIRhWJM1w0EtmuQ1io= -github.com/shenwei356/stable v0.1.4 h1:cQL/I2pBpLEH8UWNo0SE3/q/e3XwzFN2NUnpECzwDoU= -github.com/shenwei356/stable v0.1.4/go.mod h1:KghgqlviHPiKn9AuSTpadb7ep74n42VsNtPLoZZ/JIc= +github.com/shenwei356/stable v0.1.5 h1:d6VivPq2YOzQkTve6tW9FxHeiTrZtqV4rcGBC39aCIM= +github.com/shenwei356/stable v0.1.5/go.mod h1:KghgqlviHPiKn9AuSTpadb7ep74n42VsNtPLoZZ/JIc= github.com/shenwei356/util v0.5.2 h1:kU9bnkE3RRUAlya+hbfwy83iTMOJqIHOlYgejYPb7mU= github.com/shenwei356/util v0.5.2/go.mod h1:3tRAOfreWdgl/Zh1gE008h2lWocf5/YAxVSjgLKvd4k= -github.com/shenwei356/xopen v0.2.2 h1:g1v3YjiIky9k6oN4qmnU1bDciAHnSrmOn2sMTE5pChY= github.com/shenwei356/xopen v0.2.2/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk= +github.com/shenwei356/xopen v0.3.1 h1:3pju0hVeRRnlpXC7s3aE/RVOhFB1S3qRJGN+eV85r3s= +github.com/shenwei356/xopen v0.3.1/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk= github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g= github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= @@ -154,6 +160,7 @@ github.com/tatsushid/go-prettytable v0.0.0-20141013043238-ed2d14c29939 h1:BhIUXV github.com/tatsushid/go-prettytable v0.0.0-20141013043238-ed2d14c29939/go.mod h1:omGxs4/6hNjxPKUTjmaNkPzehSnNJOJN6pMEbrlYIT4= github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553 h1:DRC1ubdb3ZmyyIeCSTxjZIQAnpLPfKVgYrLETQuOPjo= github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553/go.mod h1:Rj7Csq/tZ/egz+Ltc2IVpsA5309AmSMEswjkTZmq2Xc= +github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8= github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/xuri/efp v0.0.0-20220603152613-6918739fd470 
h1:6932x8ltq1w4utjmfMPVj09jdMlkY0aiA6+Skbtl3/c= diff --git a/testdata/malformed.tsv b/testdata/malformed.tsv new file mode 100644 index 0000000..3409cf4 --- /dev/null +++ b/testdata/malformed.tsv @@ -0,0 +1,6 @@ +1 Cellvibrio no quotes & not tab +2 "Cellvibrio gilvus" quotes can be removed +3 "quotes required" quotes needed (with a tab in the cell) +4 fake" record bare double-quote in non-quoted-field +5 "Cellvibrio" Winogradsky only with doub-quote in the beginning +6 fake record2" "only with doub-quote in the end"