Skip to content

Commit

Permalink
Optionally convert numbers in CSV files (#65)
Browse files Browse the repository at this point in the history
* Bump datastation/runner for new `ContentTypeInfo.ConvertNumbers` flag

* Expose `convertNumbers` flag as command line argument

* Add documentation for the `--convert-numbers` flag
  • Loading branch information
fritzgrabo authored Jun 11, 2022
1 parent 6dfe57c commit cfd934b
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 14 deletions.
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,47 @@ dsq> SELECT * FROM {} WHERE NAME = 'Kevin';
(0 rows)
```
### Converting numbers in CSV and TSV files
CSV and TSV files have no way to specify the type of the individual
values contained in them. All values are treated as strings by default.
This can lead to unexpected results in queries. Consider the following
example:
```
$ cat scores.csv
name,score
Fritz,90
Rainer,95.2
Fountainer,100
$ dsq scores.csv "SELECT * FROM {} ORDER BY score"
[{"name":"Fountainer","score":"100"},
{"name":"Fritz","score":"90"},
{"name":"Rainer","score":"95.2"}]
```
Note how the `score` column contains only numerical values. Still,
sorting by that column yields unexpected results because the values are
treated as strings and sorted lexicographically (so `"100"` sorts before
`"90"`). You can tell that the individual scores were imported as strings
because they're quoted in the JSON result.
Use the `-n` or `--convert-numbers` flag to auto-detect and convert
numerical values (integers and floats) in imported files:
```
$ dsq scores.csv --convert-numbers "SELECT * FROM {} ORDER BY score"
[{"name":"Fritz","score":90},
{"name":"Rainer","score":95.2},
{"name":"Fountainer","score":100}]
```
Note how the scores are imported as numbers now and how the records in
the result set are sorted by their numerical value. Also note that the
individual scores are no longer quoted in the JSON result.
## Supported Data Types
| Name | File Extension(s) | Mime Type | Notes |
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ go 1.18
require (
github.com/chzyer/readline v1.5.0
github.com/google/uuid v1.3.0
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2
github.com/olekukonko/tablewriter v0.0.5
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb h1:sG23Q6XOfcOtK9bM4QhcmGiqsFVzoXwkZRvb8OJ3EiU=
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb/go.mod h1:UCms/xK08DspNqDDZ5XsaIlc39AuREmeELspFYghMGI=
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2 h1:WWCPwJPWfBVUhuAfFZJGs6vxemeeqW8ahDRtTtbGyxw=
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2/go.mod h1:UCms/xK08DspNqDDZ5XsaIlc39AuREmeELspFYghMGI=
github.com/multiprocessio/go-json v0.0.0-20220308002443-61d497dd7b9e h1:NlPl7amllnQyVAkZgjBvFEkKxJSba/R8ZpaTodc7SIQ=
github.com/multiprocessio/go-json v0.0.0-20220308002443-61d497dd7b9e/go.mod h1:huI4M/MrI5px/SgmXYi0a2byKikSLgDrnMQuXOqKtw4=
github.com/multiprocessio/go-openoffice v0.0.0-20220110232726-064f5dda1956 h1:WVofL03Eq+z3LbDOfH5eKzu2U85LFZZngOMBlNaO/H0=
Expand Down
33 changes: 20 additions & 13 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func resolveContentType(fileExtensionOrContentType string) runner.MimeType {
return runner.GetMimeType("x."+fileExtensionOrContentType, runner.ContentTypeInfo{})
}

func evalFileInto(file, mimetype string, out *os.File) error {
func evalFileInto(file, mimetype string, convertNumbers bool, out *os.File) error {
if mimetype == "" {
mimetype = string(runner.GetMimeType(file, runner.ContentTypeInfo{}))
} else {
Expand All @@ -55,7 +55,8 @@ func evalFileInto(file, mimetype string, out *os.File) error {
defer w.Flush()

return runner.TransformFile(file, runner.ContentTypeInfo{
Type: mimetype,
Type: mimetype,
ConvertNumbers: convertNumbers,
}, w)
}

Expand Down Expand Up @@ -213,7 +214,7 @@ func getFilesContentHash(files []string) (string, error) {
return hex.EncodeToString(sha1.Sum(nil)), nil
}

func importFile(projectId string, file, mimetype string, ec runner.EvalContext) (*runner.PanelInfo, error) {
func importFile(projectId string, file, mimetype string, convertNumbers bool, ec runner.EvalContext) (*runner.PanelInfo, error) {
panelId := uuid.New().String()
resultFile := ec.GetPanelResultsFile(projectId, panelId)
out, err := openTruncate(resultFile)
Expand All @@ -222,7 +223,7 @@ func importFile(projectId string, file, mimetype string, ec runner.EvalContext)
}
defer out.Close()

if err := evalFileInto(file, mimetype, out); err != nil {
if err := evalFileInto(file, mimetype, convertNumbers, out); err != nil {
return nil, err
}

Expand Down Expand Up @@ -333,14 +334,15 @@ func repl(project *runner.ProjectState, ec *runner.EvalContext, args *args, file
}

type args struct {
pipedMimetype string
pretty bool
schema bool
sqlFile string
cacheSettings runner.CacheSettings
nonFlagArgs []string
dumpCacheFile bool
isInteractive bool
pipedMimetype string
pretty bool
schema bool
sqlFile string
cacheSettings runner.CacheSettings
nonFlagArgs []string
dumpCacheFile bool
isInteractive bool
convertNumbers bool
}

func getArgs() (*args, error) {
Expand Down Expand Up @@ -415,6 +417,11 @@ func getArgs() (*args, error) {
continue
}

if arg == "-n" || arg == "--convert-numbers" {
args.convertNumbers = true
continue
}

args.nonFlagArgs = append(args.nonFlagArgs, arg)
}

Expand Down Expand Up @@ -558,7 +565,7 @@ func _main() error {
// When dumping schema, need to ingest even if cache is on.
if !args.cacheSettings.CachePresent || !args.cacheSettings.Enabled || lastNonFlagArg == "" {
for _, file := range files {
panel, err := importFile(project.Id, file, mimetypeOverride[file], ec)
panel, err := importFile(project.Id, file, mimetypeOverride[file], args.convertNumbers, ec)
if err != nil {
return err
}
Expand Down

0 comments on commit cfd934b

Please sign in to comment.