Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

jsonl merger tool in go #1

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
*.exe
*.exe~
*.dll
*.so
*.dylib
bin/
input/
output/
# Test binary, built with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Dependency directories (remove the comment below to include it)
# vendor/

# Go workspace file
go.work
go.work.sum

# env file
.env
13 changes: 13 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Image for running the JSONL merge tool, based on the golang:1.20 image.
FROM golang:1.20

# Project root inside the image; the COPY below places files here.
WORKDIR /training-ut-util-go

# Copy the entire build context into the project root.
COPY . .

# Reconcile go.mod/go.sum with the imports used by the sources.
RUN go mod tidy

# Later instructions and the container's default working directory use scripts/.
WORKDIR /training-ut-util-go/scripts

# Ensure the entrypoint script is executable inside the image.
RUN chmod +x /training-ut-util-go/scripts/docker_entrypoint.sh

# Default command: run the entrypoint script.
CMD ["/training-ut-util-go/scripts/docker_entrypoint.sh"]
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ This repository contains a Go CLI that performs file merging operations. The CLI

**Objective:** Create a Go CLI tool named `util` that performs file merging operations. The tool should merge all JSONL files in a specified directory into a single JSONL file and store it in an output directory. Additionally, if the `--format csv` flag is used, the output should be a CSV file.

## How to run the Tool
- cd scripts
- chmod +x build.sh
- ./build.sh --operation merge --input-path [your input-path] --output-path [your output-path]
Then follow the prompts shown in the terminal.

## How to run the tool in a dockerized environment
- cd scripts
- chmod +x docker_entrypoint.sh
- cd ..
- docker-compose up --build

## Requirements

1. **CLI Flags:**
Expand Down Expand Up @@ -59,7 +71,6 @@ This repository contains a Go CLI that performs file merging operations. The CLI
```bash
./bin/util --operation merge --input-path ./data/jsonl_files/ --output-path ./output_data --format csv
```

## Submission

Submit the following:
Expand Down
16 changes: 16 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Compose definition for running the merge tool in a single container.
version: '3.8'

services:
# The one service of this project: built from the Dockerfile in this directory.
go-app:
build:
context: .
dockerfile: Dockerfile
container_name: go_app_container
# Bind mounts: the project directory, plus the host input/ and output/ folders.
volumes:
- .:/training-ut-util-go
- ./input:/training-ut-util-go/input
- ./output:/training-ut-util-go/output
# Working directory used for the container's command.
working_dir: /training-ut-util-go
# Paths of the mounted input/output folders, exposed as environment variables.
environment:
INPUT_PATH: /training-ut-util-go/input
OUTPUT_PATH: /training-ut-util-go/output
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/datumbrain/training-ut-util-go

go 1.20
6 changes: 6 additions & 0 deletions input/output(1).jsonl
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This input/ as well as any other folder like output/ should be in .gitignore.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"id": 1, "name": "Alice Johnson", "email": "[email protected]", "signup_date": "2023-01-15"}
{"id": 2, "name": "Bob Smith", "email": "[email protected]", "signup_date": "2023-03-22"}
{"id": 3, "name": "Charlie Brown", "email": "[email protected]", "signup_date": "2023-05-10"}
{"id": 4, "name": "Diana Prince", "email": "[email protected]", "signup_date": "2023-06-05"}
{"id": 5, "name": "Eve Adams", "email": "[email protected]", "signup_date": "2023-07-01"}

6 changes: 6 additions & 0 deletions input/output.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"title": "To Kill a Mockingbird", "author": "Harper Lee", "year": 1960, "genre": "Fiction"}
{"title": "1984", "author": "George Orwell", "year": 1949, "genre": "Dystopian"}
{"title": "The Great Gatsby", "author": "F. Scott Fitzgerald", "year": 1925, "genre": "Fiction"}
{"title": "Pride and Prejudice", "author": "Jane Austen", "year": 1813, "genre": "Romance"}
{"title": "The Catcher in the Rye", "author": "J.D. Salinger", "year": 1951, "genre": "Fiction"}

11 changes: 11 additions & 0 deletions output/output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
title,author,year,genre,id,name,email,signup_date
,,,,1,Alice Johnson,[email protected],2023-01-15
,,,,2,Bob Smith,[email protected],2023-03-22
,,,,3,Charlie Brown,[email protected],2023-05-10
,,,,4,Diana Prince,[email protected],2023-06-05
,,,,5,Eve Adams,[email protected],2023-07-01
To Kill a Mockingbird,Harper Lee,1960,Fiction,,,,
1984,George Orwell,1949,Dystopian,,,,
The Great Gatsby,F. Scott Fitzgerald,1925,Fiction,,,,
Pride and Prejudice,Jane Austen,1813,Romance,,,,
The Catcher in the Rye,J.D. Salinger,1951,Fiction,,,,
10 changes: 10 additions & 0 deletions output/output.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{"id": 1, "name": "Alice Johnson", "email": "[email protected]", "signup_date": "2023-01-15"}
{"id": 2, "name": "Bob Smith", "email": "[email protected]", "signup_date": "2023-03-22"}
{"id": 3, "name": "Charlie Brown", "email": "[email protected]", "signup_date": "2023-05-10"}
{"id": 4, "name": "Diana Prince", "email": "[email protected]", "signup_date": "2023-06-05"}
{"id": 5, "name": "Eve Adams", "email": "[email protected]", "signup_date": "2023-07-01"}
{"title": "To Kill a Mockingbird", "author": "Harper Lee", "year": 1960, "genre": "Fiction"}
{"title": "1984", "author": "George Orwell", "year": 1949, "genre": "Dystopian"}
{"title": "The Great Gatsby", "author": "F. Scott Fitzgerald", "year": 1925, "genre": "Fiction"}
{"title": "Pride and Prejudice", "author": "Jane Austen", "year": 1813, "genre": "Romance"}
{"title": "The Catcher in the Rye", "author": "J.D. Salinger", "year": 1951, "genre": "Fiction"}
57 changes: 57 additions & 0 deletions scripts/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash

# usage prints the interactive menu: option 1 merges into a JSONL file,
# option 2 merges into a CSV file. Each menu line is followed by a blank line.
usage() {
    printf '%s\n\n' "Press 1 to merge the JSONL files into a separate file"
    printf '%s\n\n' "Press 2 to merge the JSONL files into a CSV"
}

# Defaults for the supported command-line options.
operation=""
input_path=""
output_path=""
format="jsonl"
build=false

# Parse command-line arguments.
# Every branch must either consume its arguments or exit: a branch that leaves
# "$1" in place would make this loop spin forever.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --operation | --input-path | --output-path | --format)
            # `shift 2` fails (and shifts nothing) when the value is missing,
            # so check for the value explicitly before consuming the pair.
            if [[ $# -lt 2 ]]; then
                echo "Missing value for option: $1" >&2
                exit 1
            fi
            case "$1" in
                --operation) operation="$2" ;;
                --input-path) input_path="$2" ;;
                --output-path) output_path="$2" ;;
                --format) format="$2" ;;
            esac
            shift 2 # Move past the option and its value
            ;;
        -b)
            build=true
            shift
            ;;
        *)
            # Unknown option: report it and stop instead of looping on it.
            echo "Invalid option: $1" >&2
            exit 1
            ;;
    esac
done

# Work from the repository root (the parent of this script's directory) so
# util.go is found no matter which directory the script is invoked from.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" || exit 1

if [ "$build" == true ]; then
    # -b: compile the tool into bin/util instead of running it.
    cd "$repo_root" || exit 1
    mkdir -p bin
    go build -o bin/util ./util.go
else
    # Interactive mode: ask which output format to produce.
    usage
    read -r input
    case "$input" in
        1)
            cd "$repo_root" || exit 1
            go run util.go --operation merge --input-path "$input_path" --output-path "$output_path"
            ;;
        2)
            cd "$repo_root" || exit 1
            go run util.go --operation merge --input-path "$input_path" --output-path "$output_path" --format csv
            ;;
        *)
            # Anything else used to be ignored silently; fail loudly instead.
            echo "Invalid choice: '$input' (expected 1 or 2)" >&2
            exit 1
            ;;
    esac
fi
4 changes: 4 additions & 0 deletions scripts/docker_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Container entrypoint: merges the JSONL files of the input directory into
# output.jsonl and then into output.csv.
#
# INPUT_PATH / OUTPUT_PATH override the directories; when unset, the paths
# under /training-ut-util-go are used.

# Stop at the first failing command so a failed merge is reflected in the
# container's exit status.
set -e

# util.go lives in the parent of this script's directory; run from there so
# the script does not depend on the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

input_path="${INPUT_PATH:-/training-ut-util-go/input}"
output_path="${OUTPUT_PATH:-/training-ut-util-go/output}"

go run util.go --operation merge --input-path "$input_path" --output-path "$output_path"

go run util.go --operation merge --input-path "$input_path" --output-path "$output_path" --format csv
158 changes: 158 additions & 0 deletions util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package main

import (
	"bufio"
	"encoding/csv"
	"encoding/json"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"sort"
)

// main parses the command-line flags and merges every *.jsonl file found
// directly inside --input-path into a single file inside --output-path.
//
// Flags:
//   - --operation: must be "merge".
//   - --input-path: directory that is searched (non-recursively) for *.jsonl files.
//   - --output-path: directory that receives output.jsonl or output.csv; it is
//     created when it does not exist.
//   - --format: empty for JSONL output, "csv" for CSV output.
//
// Fatal problems are reported on standard error and end the process with exit
// status 1, so calling scripts can detect a failed merge.
func main() {
	var operation, inputPath, outputPath, format string

	flag.StringVar(&operation, "operation", "", "Specify the operation")
	flag.StringVar(&inputPath, "input-path", "", "Specify the Input Path")
	flag.StringVar(&outputPath, "output-path", "", "Specify the Output Path")
	flag.StringVar(&format, "format", "", "Specify the format")
	flag.Parse()

	if operation != "merge" {
		fail("Only merge operation is allowed")
	}
	if format != "" && format != "csv" {
		fail("Only CSV format is allowed")
	}

	files, err := filepath.Glob(filepath.Join(inputPath, "*.jsonl"))
	if err != nil {
		fail("Error finding .jsonl files: %v", err)
	}

	// An empty output path means "current directory"; anything else is created
	// on demand so a fresh checkout without an output folder still works.
	if outputPath != "" {
		if err := os.MkdirAll(outputPath, 0755); err != nil {
			fail("Error creating output directory: %v", err)
		}
	}

	if format == "csv" {
		csvFilename := filepath.Join(outputPath, "output.csv")
		if err := convertToCSV(files, csvFilename); err != nil {
			fail("Error converting to CSV: %v", err)
		}
		fmt.Printf("CSV data saved to: %s\n", csvFilename)
		return
	}

	outputFilename := filepath.Join(outputPath, "output.jsonl")
	if err := mergeJSONLFiles(files, outputFilename); err != nil {
		fail("Error %v", err)
	}
	fmt.Printf("Merged data saved to: %s\n", outputFilename)
}

// fail prints the formatted message on standard error and exits with status 1.
func fail(format string, args ...interface{}) {
	fmt.Fprintf(os.Stderr, format+"\n", args...)
	os.Exit(1)
}

// mergeJSONLFiles writes the non-empty lines of files, in order, to
// outputFilename, one line each.
//
// The output file is replaced rather than appended to, so running the tool
// twice yields the same result instead of duplicated records. Inputs that
// cannot be opened or read are reported and skipped; only problems with the
// output file are returned as an error.
func mergeJSONLFiles(files []string, outputFilename string) (err error) {
	// Remember an already existing output file so it can be recognised among
	// the inputs: when the input and output directories coincide, a previous
	// output.jsonl matches the *.jsonl glob and must not be merged into itself.
	var previous os.FileInfo
	if info, statErr := os.Stat(outputFilename); statErr == nil {
		previous = info
	}

	out, err := os.Create(outputFilename)
	if err != nil {
		return fmt.Errorf("opening output file: %w", err)
	}
	defer func() {
		// Close can surface a delayed write error; do not lose it.
		if closeErr := out.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("writing to output file: %w", closeErr)
		}
	}()

	writer := bufio.NewWriter(out)
	for _, file := range files {
		if previous != nil {
			if info, statErr := os.Stat(file); statErr == nil && os.SameFile(previous, info) {
				fmt.Printf("Skipping previous output file: %v\n", file)
				continue
			}
		}
		fmt.Printf("Merging file: %v\n", file)
		if err := appendJSONLFile(writer, file); err != nil {
			return err
		}
	}

	if err := writer.Flush(); err != nil {
		return fmt.Errorf("writing to output file: %w", err)
	}
	return nil
}

// appendJSONLFile copies every non-empty line of the file at path to w,
// terminating each with a newline.
//
// Failures to open or read the input are reported on standard error and
// treated as non-fatal (nil is returned) so the remaining inputs are still
// merged. A failed write to w is returned. The input is closed before the
// function returns, so merging many files does not accumulate open handles.
func appendJSONLFile(w *bufio.Writer, path string) error {
	f, err := os.Open(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error Opening File %s: %v\n", path, err)
		return nil
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	// The default 64 KiB token limit is easily exceeded by a single JSONL
	// record; allow lines of up to 16 MiB.
	scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue // Skip empty lines
		}
		if _, err := w.Write(line); err != nil {
			return fmt.Errorf("writing to output file: %w", err)
		}
		if err := w.WriteByte('\n'); err != nil {
			return fmt.Errorf("writing to output file: %w", err)
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "Error reading file %s: %v\n", path, err)
	}
	return nil
}

// convertToCSV reads every JSON object from the given JSONL files and writes
// them as rows of a single CSV file at csvFilename, replacing any existing file.
//
// The header is the union of all keys found in the records, sorted
// alphabetically so the column order is the same on every run. A record that
// lacks a key gets an empty cell in that column. Rows keep the order of
// jsonlFiles and of the lines within each file.
//
// Cell values: JSON strings are written unquoted, numbers and booleans keep
// their original JSON text (so 1000000 is not turned into 1e+06 and large
// integers keep all digits), null becomes an empty cell, and nested arrays
// or objects are written as their JSON text.
//
// It returns an error when an input cannot be opened, read or parsed, or when
// the CSV file cannot be created or completely written.
func convertToCSV(jsonlFiles []string, csvFilename string) (err error) {
	var records []map[string]json.RawMessage // one entry per JSON object, in input order
	fieldSet := map[string]struct{}{}        // every key seen in any record

	// Reading and parsing the JSONL files
	for _, file := range jsonlFiles {
		fileRecords, err := readJSONLRecords(file)
		if err != nil {
			return err
		}
		for _, record := range fileRecords {
			for key := range record {
				fieldSet[key] = struct{}{}
			}
		}
		records = append(records, fileRecords...)
	}

	// Map iteration order is random; sort for a stable header.
	fields := make([]string, 0, len(fieldSet))
	for field := range fieldSet {
		fields = append(fields, field)
	}
	sort.Strings(fields)

	csvFile, err := os.Create(csvFilename)
	if err != nil {
		return fmt.Errorf("error creating CSV file: %w", err)
	}
	defer func() {
		// Close can surface a delayed write error; do not lose it.
		if closeErr := csvFile.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("error closing CSV file: %w", closeErr)
		}
	}()

	writer := csv.NewWriter(csvFile)
	if err := writer.Write(fields); err != nil {
		return fmt.Errorf("error writing header to CSV: %w", err)
	}

	// Writing records to the CSV file
	row := make([]string, len(fields))
	for _, record := range records {
		for i, field := range fields {
			cell, err := csvCell(record[field])
			if err != nil {
				return fmt.Errorf("error writing record to CSV: %w", err)
			}
			row[i] = cell
		}
		if err := writer.Write(row); err != nil {
			return fmt.Errorf("error writing record to CSV: %w", err)
		}
	}

	// Flush reports nothing itself; buffered write errors show up in Error.
	writer.Flush()
	if err := writer.Error(); err != nil {
		return fmt.Errorf("error writing record to CSV: %w", err)
	}
	return nil
}

// readJSONLRecords parses each non-empty line of the file at path as a JSON
// object and returns the objects in file order, with values kept as raw JSON.
// The file is closed before the function returns.
func readJSONLRecords(path string) ([]map[string]json.RawMessage, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("error opening JSONL file %s: %w", path, err)
	}
	defer f.Close()

	var records []map[string]json.RawMessage
	scanner := bufio.NewScanner(f)
	// The default 64 KiB token limit is easily exceeded by a single JSONL
	// record; allow lines of up to 16 MiB.
	scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue // Skip empty lines
		}
		var record map[string]json.RawMessage
		if err := json.Unmarshal(line, &record); err != nil {
			return nil, fmt.Errorf("error parsing JSON in file %s: %w", path, err)
		}
		records = append(records, record)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading file %s: %w", path, err)
	}
	return records, nil
}

// csvCell converts one raw JSON value into the text of a CSV cell: a missing
// value or null gives "", a JSON string gives its decoded content, and any
// other value (number, boolean, array, object) gives its JSON text unchanged.
func csvCell(raw json.RawMessage) (string, error) {
	if len(raw) == 0 {
		return "", nil // key absent from this record
	}
	switch raw[0] {
	case '"':
		var s string
		if err := json.Unmarshal(raw, &s); err != nil {
			return "", err
		}
		return s, nil
	case 'n': // the only JSON value starting with 'n' is null
		return "", nil
	default:
		return string(raw), nil
	}
}