-
Notifications
You must be signed in to change notification settings - Fork 0
/
csv2parquet.py
34 lines (22 loc) · 899 Bytes
/
csv2parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import argparse
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv
from pyarrow.csv import ConvertOptions
def parse_args(argv=None):
    """Parse the command-line arguments for the CSV-to-Parquet converter.

    Args:
        argv: Optional list of argument strings to parse. When omitted
            (the default), argparse falls back to ``sys.argv[1:]``, so
            existing zero-argument callers behave as before.

    Returns:
        argparse.Namespace whose ``csv`` attribute holds the value passed
        to ``-c``/``--csv``, or ``None`` when the option was not given.
    """
    parser = argparse.ArgumentParser(description='Convert CSV to Parquet')
    parser.add_argument('-c', '--csv', help="the path to the csv file to be converted.")
    return parser.parse_args(argv)
def convert(csv_file):
    """Convert a CSV file to a Parquet file written next to it.

    The output path is the input path with its final extension replaced by
    ``.parquet`` (e.g. ``data/trades.csv`` -> ``data/trades.parquet``).
    A column named ``date`` is read as ``date32``, and ``%Y-%m-%d`` is
    registered as the timestamp parser.

    Args:
        csv_file: Path to the CSV file to read.
    """
    # Replace only the last suffix. Splitting on the first '.' mangles paths
    # such as './data/file.csv' or 'my.data.csv'.
    out_file = Path(csv_file).with_suffix('.parquet')
    convert_opts = ConvertOptions(
        timestamp_parsers=['%Y-%m-%d'],
        column_types={'date': pa.date32()},
    )
    reader = csv.open_csv(csv_file, convert_options=convert_opts)
    # Write to out_file, never to csv_file: the reader is still streaming
    # from the input. The with-block closes the writer on exit so the
    # Parquet file is finalized even if a batch fails to convert.
    with pq.ParquetWriter(str(out_file), reader.schema) as writer:
        # Stream batch by batch instead of materializing the whole CSV.
        for batch in reader:
            writer.write_table(pa.Table.from_batches([batch], reader.schema))
if __name__ == '__main__':
    # Script entry point: require --csv, then run the conversion.
    args = parse_args()
    if args.csv is None:
        print("Where's the CSV file dude")
        # quit() is a helper injected by the site module for interactive
        # sessions and may be absent (e.g. python -S). Raise SystemExit
        # directly, with a non-zero status so callers can detect the error.
        raise SystemExit(1)
    convert(args.csv)