Extract CSV data importer functionality to re-usable component
your committed Nov 27, 2023
1 parent 18032c8 commit 9a92d2d
Showing 7 changed files with 390 additions and 0 deletions.
1 change: 1 addition & 0 deletions config/application.rb
@@ -16,6 +16,7 @@
# require "rails/test_unit/railtie"

require_relative "../lib/student_loan"
require_relative "../lib/csv_importer"
require_relative "../lib/dfe_sign_in"
require_relative "../lib/hmrc"
require_relative "../lib/ordnance_survey"
6 changes: 6 additions & 0 deletions lib/csv_importer.rb
@@ -0,0 +1,6 @@
module CsvImporter
end

require_relative "csv_importer/config"
require_relative "csv_importer/errors"
require_relative "csv_importer/base"
71 changes: 71 additions & 0 deletions lib/csv_importer/base.rb
@@ -0,0 +1,71 @@
require "csv"

module CsvImporter
class Base
include CsvImporter::Config

attr_reader :errors, :rows

def initialize(file)
@errors = []
@rows = parse_csv_file(file)

check_headers if rows && with_headers?
end

def run
target_data_model.delete_all unless append_only

rows.each_slice(batch_size).with_index(1) do |batch_rows, i|
Rails.logger.info "Processing #{target_data_model.to_s.titleize} batch #{i}"

record_hashes = batch_rows.map do |row|
next if valid_skip_row_conditions?(row)

convert_row_to_hash(row)
end.compact

target_data_model.insert_all(record_hashes) unless record_hashes.empty?
end
end

private

def with_headers?
parse_headers && mandatory_headers&.is_a?(Array) && mandatory_headers&.any?
end

def check_headers
missing_headers = mandatory_headers - rows.headers

if missing_headers.any?
errors.append("The selected file is missing some expected columns: #{missing_headers.join(", ")}")
end
end

def parse_csv_file(file)
if file.nil?
errors.append("Select a file")
nil
else
CSV.read(file.to_io, headers: parse_headers, encoding: "BOM|UTF-8")
end
rescue CSV::MalformedCSVError
errors.append("The selected file must be a CSV")
nil
end

def valid_skip_row_conditions?(row)
return false unless skip_row_if_method || skip_row_if_lambda
return method(skip_row_if_method).call(row) if skip_row_if_method

skip_row_if_lambda&.call(row)
end

def convert_row_to_hash(row)
return method(transform_rows_with_method).call(row) if transform_rows_with_method

transform_rows_with_lambda&.call(row)
end
end
end
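For context, the intended calling pattern is to instantiate a configured subclass with an uploaded file, check `errors` (populated during parsing and header validation), and only then call `run`. A minimal usage sketch follows; the `SchoolDataImporter` subclass and the `uploaded_file` variable are illustrative assumptions, not part of this commit:

# Illustrative only: SchoolDataImporter is a hypothetical subclass of CsvImporter::Base,
# configured via import_options (defined in lib/csv_importer/config.rb below).
importer = SchoolDataImporter.new(uploaded_file)

if importer.errors.any?
  # e.g. "Select a file", "The selected file must be a CSV", or the missing-columns message
  Rails.logger.warn importer.errors.join(", ")
else
  importer.run # purges the target table (unless append_only) and bulk-inserts in batches
end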
67 changes: 67 additions & 0 deletions lib/csv_importer/config.rb
@@ -0,0 +1,67 @@
module CsvImporter
  module Config
    extend ActiveSupport::Concern

    included do
      class_attribute :target_data_model
      class_attribute :append_only
      class_attribute :parse_headers
      class_attribute :mandatory_headers
      class_attribute :batch_size
      class_attribute :transform_rows_with_method
      class_attribute :transform_rows_with_lambda
      class_attribute :skip_row_if_method
      class_attribute :skip_row_if_lambda
    end

    DEFAULT_BATCH_SIZE = 500
    DEFAULT_ROW_TRANSFORM_LAMBDA = ->(row) { row&.to_h }

    class_methods do
      #
      # @param options [Hash] options to be used in the configuration
      # @option options [Class] :target_data_model the target data model
      # @option options [Boolean] :append_only whether to append rows without purging first (defaults to `false`)
      # @option options [Proc] :transform_rows_with the lambda to execute to transform each row (it can also be a method name)
      # @option options [Proc] :skip_row_if the lambda to execute to skip a row (it can also be a method name)
      # @option options [Integer] :batch_size the batch size for each transform and import step (defaults to 500)
      # @option options [Boolean] :parse_headers whether to parse headers or not (defaults to `true`)
      # @option options [Array<String>] :mandatory_headers a list of mandatory headers to validate, if required
      #
      # @return [void]
      #
      def import_options(options = {})
        target_data_model = options[:target_data_model] || raise(UndefinedDataModelError)
        append_only = (!options[:append_only].nil?) ? options[:append_only] : false
        transform_rows_with = options[:transform_rows_with] || DEFAULT_ROW_TRANSFORM_LAMBDA
        skip_row_if = options[:skip_row_if]
        batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
        parse_headers = (!options[:parse_headers].nil?) ? options[:parse_headers] : true
        mandatory_headers = options[:mandatory_headers] || []

        self.target_data_model = target_data_model
        self.append_only = append_only
        self.parse_headers = parse_headers
        self.mandatory_headers = mandatory_headers

        if transform_rows_with.is_a?(Symbol)
          self.transform_rows_with_method = transform_rows_with
        elsif transform_rows_with&.lambda?
          self.transform_rows_with_lambda = transform_rows_with
        end

        self.batch_size = batch_size

        return unless skip_row_if

        if skip_row_if.is_a?(Symbol)
          self.skip_row_if_method = skip_row_if
        elsif skip_row_if&.lambda?
          self.skip_row_if_lambda = skip_row_if
        end

        nil
      end
    end
  end
end
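To illustrate how these options combine, a hypothetical subclass of `CsvImporter::Base` might be configured as follows; the `SchoolDataImporter` class, the `School` model and the column names are invented for this example and are not part of this commit:

# Hypothetical importer: loads a schools CSV into a School model.
class SchoolDataImporter < CsvImporter::Base
  import_options(
    target_data_model: School,
    append_only: false,                                    # delete existing rows before inserting
    mandatory_headers: ["URN", "EstablishmentName"],
    batch_size: 1_000,
    skip_row_if: ->(row) { row["EstablishmentName"].blank? },
    transform_rows_with: :row_to_school_hash                # a Symbol is treated as an instance method name
  )

  private

  def row_to_school_hash(row)
    {urn: row["URN"].to_i, name: row["EstablishmentName"]}
  end
end

Passing lambdas for `transform_rows_with` and `skip_row_if` works equally well, as the spec further down exercises.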
7 changes: 7 additions & 0 deletions lib/csv_importer/errors.rb
@@ -0,0 +1,7 @@
module CsvImporter
  class UndefinedDataModelError < StandardError
    def message
      "You must specify the underlying data model with `import_options target_data_model: CustomModel`"
    end
  end
end
155 changes: 155 additions & 0 deletions spec/lib/csv_importer/base_spec.rb
@@ -0,0 +1,155 @@
require "rails_helper"

RSpec.describe CsvImporter::Base do
subject(:importer) { dummy_class.new(file) }

let(:dummy_class) { Class.new(described_class) }
let(:file) {
Tempfile.new.tap { |f|
f.write(csv_str)
f.rewind
}
}
let(:csv_str) { "" }

let(:csv_parse_options) do
{
target_data_model:,
append_only:,
batch_size:,
parse_headers:,
mandatory_headers:,
transform_rows_with:,
skip_row_if:
}
end
let(:target_data_model) { double("TargetDataModel", delete_all: nil, insert_all: nil, is_a?: true) }
let(:append_only) { false }
let(:batch_size) { nil }
let(:parse_headers) { true }
let(:mandatory_headers) { [] }
let(:transform_rows_with) { nil }
let(:skip_row_if) { nil }

before { dummy_class.import_options(**csv_parse_options) }

describe "CSV parsing and validation" do
context "when the file is nil" do
let(:file) { nil }

it "sets an error message" do
expect(importer.errors).to eq(["Select a file"])
end
end

context "when the file is malformed" do
let(:csv_str) { "\"" }

it "sets an error message" do
expect(importer.errors).to eq(["The selected file must be a CSV"])
end
end

context "when the file is valid" do
context "with missing headers" do
let(:mandatory_headers) { ["a", "b", "c", "d"] }
let(:csv_str) { "a,b\n1,2" }

it "sets an error message" do
expect(importer.errors).to eq(["The selected file is missing some expected columns: c, d"])
end

it "parses the CSV" do
expect(importer.rows.map(&:to_h)).to eq([{"a" => "1", "b" => "2"}])
end
end

context "with no missing headers" do
let(:mandatory_headers) { ["a", "b", "c"] }
let(:csv_str) { "a,b,c\n1,2,3" }

it "does not set any error message" do
expect(importer.errors).to be_empty
end

it "parses the CSV" do
expect(importer.rows.map(&:to_h)).to eq([{"a" => "1", "b" => "2", "c" => "3"}])
end
end
end
end

describe "data transformation and importing" do
before { importer.run }

context "with `append_only: false`" do
let(:append_only) { false }
let(:csv_str) { "a,b\n1,2\n3,4" }

let(:expected_records) { [{"a" => "1", "b" => "2"}, {"a" => "3", "b" => "4"}] }

it "issues `delete_all` on the target table and `insert_all` for the processed rows" do
aggregate_failures do
expect(target_data_model).to have_received(:delete_all).ordered
expect(target_data_model).to have_received(:insert_all).with(expected_records).ordered
end
end
end

context "with `append_only: true`" do
let(:append_only) { true }
let(:csv_str) { "a,b\n1,2\n3,4" }

let(:expected_records) { [{"a" => "1", "b" => "2"}, {"a" => "3", "b" => "4"}] }

it "does not issue `delete_all` on the target table, issues `insert_all` for the processed rows" do
aggregate_failures do
expect(target_data_model).not_to have_received(:delete_all)
expect(target_data_model).to have_received(:insert_all).with(expected_records).ordered
end
end
end

context "with a `skip_row_if` lambda" do
let(:csv_str) { "a,b\n1,2\n3,NULL" }
let(:skip_row_if) { ->(row) { row["b"] == "NULL" } }

let(:expected_records) { [{"a" => "1", "b" => "2"}] }

it "issues `insert_all` on the target table for the processed rows" do
expect(target_data_model).to have_received(:insert_all).with(expected_records).ordered
end
end

context "with a `transform_rows_with` lambda" do
let(:csv_str) { "a,b\n1,2\n3,4" }
let(:transform_rows_with) { ->(row) { {custom_key_a: row["a"] + "foo", custom_key_b: row["b"].to_i} } }

let(:expected_records) do
[
{custom_key_a: "1foo", custom_key_b: 2},
{custom_key_a: "3foo", custom_key_b: 4}
]
end

it "issues `insert_all` on the target table for the processed rows" do
expect(target_data_model).to have_received(:insert_all).with(expected_records).ordered
end
end

context "with a `batch_size` value" do
let(:csv_str) { "a,b\n1,2\n3,4\n5,6" }
let(:batch_size) { 2 }

let(:expected_batch_one) { [{"a" => "1", "b" => "2"}, {"a" => "3", "b" => "4"}] }
let(:expected_batch_two) { [{"a" => "5", "b" => "6"}] }

it "issues `insert_all` for each batch on the target table for the processed rows" do
aggregate_failures do
expect(target_data_model).to have_received(:insert_all).with(expected_batch_one).ordered
expect(target_data_model).to have_received(:insert_all).with(expected_batch_two).ordered
end
end
end
end
end